aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec12
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/bpf/Kconfig2
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arena.c29
-rw-r--r--kernel/bpf/arraymap.c54
-rw-r--r--kernel/bpf/bloom_filter.c13
-rw-r--r--kernel/bpf/bpf_local_storage.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c10
-rw-r--r--kernel/bpf/btf.c27
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c84
-rw-r--r--kernel/bpf/cpumap.c3
-rw-r--r--kernel/bpf/cpumask.c1
-rw-r--r--kernel/bpf/crypto.c385
-rw-r--r--kernel/bpf/disasm.c14
-rw-r--r--kernel/bpf/hashtab.c64
-rw-r--r--kernel/bpf/helpers.c364
-rw-r--r--kernel/bpf/log.c4
-rw-r--r--kernel/bpf/lpm_trie.c31
-rw-r--r--kernel/bpf/syscall.c88
-rw-r--r--kernel/bpf/sysfs_btf.c6
-rw-r--r--kernel/bpf/trampoline.c20
-rw-r--r--kernel/bpf/verifier.c717
-rw-r--r--kernel/configs/hardening.config19
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/crash_core.c764
-rw-r--r--kernel/crash_reserve.c471
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/direct.c9
-rw-r--r--kernel/dma/swiotlb.c181
-rw-r--r--kernel/elfcorehdr.c (renamed from kernel/crash_dump.c)0
-rw-r--r--kernel/entry/common.c8
-rw-r--r--kernel/events/core.c273
-rw-r--r--kernel/events/ring_buffer.c4
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/fork.c33
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/cpuhotplug.c27
-rw-r--r--kernel/irq/internals.h9
-rw-r--r--kernel/irq/irqdesc.c65
-rw-r--r--kernel/irq/irqdomain.c5
-rw-r--r--kernel/irq/manage.c37
-rw-r--r--kernel/irq/proc.c9
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/jump_label.c53
-rw-r--r--kernel/kallsyms_selftest.c1
-rw-r--r--kernel/kcsan/kcsan_test.c17
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kexec_core.c294
-rw-r--r--kernel/kexec_file.c15
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/lock_events.h4
-rw-r--r--kernel/locking/qspinlock.c13
-rw-r--r--kernel/locking/qspinlock_paravirt.h49
-rw-r--r--kernel/module/Kconfig8
-rw-r--r--kernel/module/internal.h6
-rw-r--r--kernel/module/main.c29
-rw-r--r--kernel/module/strict_rwx.c63
-rw-r--r--kernel/padata.c22
-rw-r--r--kernel/panic.c9
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/Kconfig26
-rw-r--r--kernel/power/energy_model.c552
-rw-r--r--kernel/power/hibernate.c109
-rw-r--r--kernel/power/main.c182
-rw-r--r--kernel/power/power.h23
-rw-r--r--kernel/power/snapshot.c25
-rw-r--r--kernel/power/suspend.c15
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c197
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/printk.c31
-rw-r--r--kernel/profile.c43
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/rcu/Kconfig8
-rw-r--r--kernel/rcu/rcu.h20
-rw-r--r--kernel/rcu/rcutorture.c85
-rw-r--r--kernel/rcu/srcutiny.c31
-rw-r--r--kernel/rcu/srcutree.c5
-rw-r--r--kernel/rcu/sync.c8
-rw-r--r--kernel/rcu/tasks.h44
-rw-r--r--kernel/rcu/tiny.c4
-rw-r--r--kernel/rcu/tree.c430
-rw-r--r--kernel/rcu/tree.h24
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h11
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/sched/core.c30
-rw-r--r--kernel/sched/cputime.c13
-rw-r--r--kernel/sched/fair.c543
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/membarrier.c13
-rw-r--r--kernel/sched/pelt.c22
-rw-r--r--kernel/sched/pelt.h16
-rw-r--r--kernel/sched/sched.h91
-rw-r--r--kernel/sched/stats.c5
-rw-r--r--kernel/sched/topology.c56
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c28
-rw-r--r--kernel/softirq.c12
-rw-r--r--kernel/stackleak.c6
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c44
-rw-r--r--kernel/time/hrtimer.c41
-rw-r--r--kernel/time/posix-clock.c16
-rw-r--r--kernel/time/tick-common.c17
-rw-r--r--kernel/time/tick-sched.c54
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/timekeeping.c96
-rw-r--r--kernel/time/timer.c36
-rw-r--r--kernel/time/timer_migration.c33
-rw-r--r--kernel/time/vsyscall.c6
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/bpf_trace.c172
-rw-r--r--kernel/trace/ftrace.c93
-rw-r--r--kernel/trace/ring_buffer.c183
-rw-r--r--kernel/trace/trace.c816
-rw-r--r--kernel/trace/trace.h18
-rw-r--r--kernel/trace/trace_benchmark.c5
-rw-r--r--kernel/trace/trace_eprobe.c8
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_events_trigger.c63
-rw-r--r--kernel/trace/trace_events_user.c209
-rw-r--r--kernel/trace/trace_fprobe.c59
-rw-r--r--kernel/trace/trace_kprobe.c58
-rw-r--r--kernel/trace/trace_probe.c419
-rw-r--r--kernel/trace/trace_probe.h30
-rw-r--r--kernel/trace/trace_probe_tmpl.h10
-rw-r--r--kernel/trace/trace_sched_switch.c515
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/trace/trace_uprobe.c14
-rw-r--r--kernel/ucount.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/vmcore_info.c232
-rw-r--r--kernel/watchdog.c237
-rw-r--r--kernel/workqueue.c23
149 files changed, 7320 insertions, 3516 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 946dffa048b7..6c34e63c88ff 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -2,11 +2,13 @@
menu "Kexec and crash features"
-config CRASH_CORE
+config CRASH_RESERVE
+ bool
+
+config VMCORE_INFO
bool
config KEXEC_CORE
- select CRASH_CORE
bool
config KEXEC_ELF
@@ -95,9 +97,11 @@ config KEXEC_JUMP
config CRASH_DUMP
bool "kernel crash dumps"
+ default y
depends on ARCH_SUPPORTS_CRASH_DUMP
- select CRASH_CORE
- select KEXEC_CORE
+ depends on KEXEC_CORE
+ select VMCORE_INFO
+ select CRASH_RESERVE
help
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
diff --git a/kernel/Makefile b/kernel/Makefile
index ce105a5558fc..3c13240dfc9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,8 +68,10 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
-obj-$(CONFIG_CRASH_CORE) += crash_core.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
+obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
@@ -120,7 +122,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
diff --git a/kernel/bounds.c b/kernel/bounds.c
index b529182e8b04..29b2cd00df2c 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, order_base_2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bc25f5098a25..4100df44c665 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -28,7 +28,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 368c5d86b5b7..7eb9ad3a3ae6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,7 +4,7 @@ ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
-CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
+CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
@@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+ifneq ($(CONFIG_CRYPTO),)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 86571e760dd6..f5953f1a95cd 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -37,8 +37,8 @@
*/
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
-#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
-#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
+#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
+#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
struct bpf_arena {
struct bpf_map map;
@@ -110,7 +110,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
return ERR_PTR(-EINVAL);
vm_range = (u64)attr->max_entries * PAGE_SIZE;
- if (vm_range > (1ull << 32))
+ if (vm_range > SZ_4G)
return ERR_PTR(-E2BIG);
if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
@@ -251,7 +251,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
int ret;
kbase = bpf_arena_get_kern_vm_start(arena);
- kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
+ kaddr = kbase + (u32)(vmf->address);
guard(mutex)(&arena->lock);
page = vmalloc_to_page((void *)kaddr);
@@ -301,7 +301,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
if (pgoff)
return -EINVAL;
- if (len > (1ull << 32))
+ if (len > SZ_4G)
return -E2BIG;
/* if user_vm_start was specified at arena creation time */
@@ -322,7 +322,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
if (WARN_ON_ONCE(arena->user_vm_start))
/* checks at map creation time should prevent this */
return -EFAULT;
- return round_up(ret, 1ull << 32);
+ return round_up(ret, SZ_4G);
}
static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
@@ -346,7 +346,7 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
return -EBUSY;
/* Earlier checks should prevent this */
- if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
return -EFAULT;
if (remember_vma(arena, vma))
@@ -420,7 +420,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (uaddr & ~PAGE_MASK)
return 0;
pgoff = compute_pgoff(arena, uaddr);
- if (pgoff + page_cnt > page_cnt_max)
+ if (pgoff > page_cnt_max - page_cnt)
/* requested address will be outside of user VMA */
return 0;
}
@@ -447,7 +447,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
goto out;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
- /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
+ /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
if (ret) {
@@ -510,6 +516,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
if (!page)
continue;
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ /* Optimization for the common case of page_cnt==1:
+ * If page wasn't mapped into some user vma there
+ * is no need to call zap_pages which is slow. When
+ * page_cnt is big it's faster to do the batched zap.
+ */
zap_pages(arena, full_uaddr, 1);
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 13358675ff2e..feabc0193852 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
+static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
+ }
+
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
+ return insn - insn_buf;
+}
+
static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -396,17 +428,22 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers(struct bpf_map *map)
+static void array_map_free_timers_wq(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free fields other than timer on uref dropping to zero. */
- if (!btf_record_has_field(map->record, BPF_TIMER))
- return;
-
- for (i = 0; i < array->map.max_entries; i++)
- bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ /* We don't reset or free fields other than timer and workqueue
+ * on uref dropping to zero.
+ */
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ }
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -750,7 +787,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers,
+ .map_release_uref = array_map_free_timers_wq,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -776,6 +813,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index addf3dd57b59..35e1ddca74d2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -80,6 +80,18 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return -EOPNOTSUPP;
}
+/* Called from syscall */
+static int bloom_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -191,6 +203,7 @@ static u64 bloom_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bloom_map_alloc_check,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
.map_get_next_key = bloom_map_get_next_key,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index bdea1a459153..976cb258a0ed 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
*
* If the local_storage->list is already empty, the caller will not
* care about the bpf_ma value also because the caller is not
- * responsibile to free the local_storage.
+ * responsible to free the local_storage.
*/
if (storage_smap)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 43356faaa057..86c7884abaf8 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -728,8 +728,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
cur_image = image;
trampoline_start = 0;
}
- if (err < 0)
- goto reset_unlock;
*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
@@ -742,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
- for (i = 0; i < st_map->image_pages_cnt; i++)
- arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+ for (i = 0; i < st_map->image_pages_cnt; i++) {
+ err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
+ }
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 90c4a32d89ff..821063660d9f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -218,6 +218,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_KPROBE,
BTF_KFUNC_HOOK_MAX,
};
@@ -3464,6 +3465,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
+ if (field_mask & BPF_WORKQUEUE) {
+ if (!strcmp(name, "bpf_wq")) {
+ if (*seen_mask & BPF_WORKQUEUE)
+ return -E2BIG;
+ *seen_mask |= BPF_WORKQUEUE;
+ type = BPF_WORKQUEUE;
+ goto end;
+ }
+ }
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@@ -3515,6 +3525,7 @@ static int btf_find_struct_field(const struct btf *btf,
switch (field_type) {
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_WORKQUEUE:
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
@@ -3582,6 +3593,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
switch (field_type) {
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_WORKQUEUE:
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
@@ -3816,6 +3828,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
+ rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
@@ -3846,6 +3859,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
+ case BPF_WORKQUEUE:
+ WARN_ON_ONCE(rec->wq_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->wq_off = rec->fields[i].offset;
+ break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -5642,8 +5660,8 @@ errout_free:
return ERR_PTR(err);
}
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
@@ -5971,6 +5989,9 @@ struct btf *btf_parse_vmlinux(void)
struct btf *btf = NULL;
int err;
+ if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ return ERR_PTR(-ENOENT);
+
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
@@ -8137,6 +8158,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_KPROBE:
+ return BTF_KFUNC_HOOK_KPROBE;
default:
return BTF_KFUNC_HOOK_MAX;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 82243cb6c54d..8ba73042a239 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2575,8 +2575,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_comm:
return &bpf_get_current_comm_proto;
#ifdef CONFIG_CGROUP_NET_CLASSID
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..733fca2634b7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -26,6 +26,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -747,7 +748,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long symbol_start = ksym->start;
unsigned long symbol_end = ksym->end;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
ret = sym;
if (size)
@@ -813,7 +814,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
if (it++ != symnum)
continue;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
*value = ksym->start;
*type = BPF_SYM_ELF_TYPE;
@@ -849,7 +850,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return -EINVAL;
}
- tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+ tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
if (!tab)
return -ENOMEM;
@@ -908,23 +909,30 @@ static LIST_HEAD(pack_list);
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
+ int err;
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
GFP_KERNEL);
if (!pack)
return NULL;
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
- if (!pack->ptr) {
- kfree(pack);
- return NULL;
- }
+ if (!pack->ptr)
+ goto out;
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
- list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)pack->ptr,
+ BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ if (err)
+ goto out;
+ list_add_tail(&pack->list, &pack_list);
return pack;
+
+out:
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ return NULL;
}
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
@@ -939,9 +947,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
size = round_up(size, PAGE_SIZE);
ptr = bpf_jit_alloc_exec(size);
if (ptr) {
+ int err;
+
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)ptr,
+ size / PAGE_SIZE);
+ if (err) {
+ bpf_jit_free_exec(ptr);
+ ptr = NULL;
+ }
}
goto out;
}
@@ -2204,6 +2219,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG] = {}; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
return ___bpf_prog_run(regs, insn); \
@@ -2217,6 +2233,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG]; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
BPF_R2 = r2; \
@@ -2403,7 +2420,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
finalize:
- bpf_prog_lock_ro(fp);
+ *err = bpf_prog_lock_ro(fp);
+ if (*err)
+ return fp;
/* The tail call compatibility check can only be done at
* this late stage as we need to determine, if we deal
@@ -2437,13 +2456,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
+ struct bpf_prog_array *p;
+
if (prog_cnt)
- return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog_array_item) *
- (prog_cnt + 1),
- flags);
+ p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
+ else
+ p = &bpf_empty_prog_array.hdr;
- return &bpf_empty_prog_array.hdr;
+ return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
@@ -2796,7 +2816,7 @@ void bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
-/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
void bpf_user_rnd_init_once(void)
@@ -2921,12 +2941,28 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+ return false;
+}
+
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
return false;
}
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
@@ -2942,6 +2978,20 @@ bool __weak bpf_jit_supports_arena(void)
return false;
}
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ return false;
+}
+
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+ return TASK_SIZE;
+#else
+ return 0;
+#endif
+}
+
/* Return TRUE if the JIT backend satisfies the following two conditions:
* 1) JIT backend supports atomic_xchg() on pointer-sized words.
* 2) Under the specific arch, the implementation of xchg() is the same
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 9ee8da477465..a8e34416e960 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -263,6 +263,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -288,10 +289,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index dad0fb1c8e87..33c473d676a5 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void)
ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
ARRAY_SIZE(cpumask_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..2bee4af91e38
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+ const struct bpf_crypto_type *type;
+ struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type: The string of crypto operation type.
+ * @reserved: Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ * @algo: The string of algorithm to initialize.
+ * @key: The cipher key used to init crypto algorithm.
+ * @key_len: The length of cipher key.
+ * @authsize: The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+ char type[14];
+ u8 reserved[2];
+ char algo[128];
+ u8 key[256];
+ u32 key_len;
+ u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type: The pointer to bpf crypto type
+ * @tfm: The pointer to instance of crypto API struct.
+ * @siv_len: Size of IV and state storage for cipher
+ * @rcu: The RCU head used to free the crypto context with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ */
+struct bpf_crypto_ctx {
+ const struct bpf_crypto_type *type;
+ void *tfm;
+ u32 siv_len;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -EEXIST;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (!strcmp(node->type->name, type->name))
+ goto unlock;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!node)
+ goto unlock;
+
+ node->type = type;
+ list_add(&node->list, &bpf_crypto_types);
+ err = 0;
+
+unlock:
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -ENOENT;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, type->name))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ err = 0;
+ break;
+ }
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+ const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+ struct bpf_crypto_type_list *node;
+
+ down_read(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, name))
+ continue;
+
+ if (try_module_get(node->type->owner))
+ type = node->type;
+ break;
+ }
+ up_read(&bpf_crypto_types_sem);
+
+ return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params: pointer to struct bpf_crypto_params which contains all the
+ * details needed to initialise crypto context.
+ * @params__sz: size of struct bpf_crypto_params used by bpf program
+ * @err: integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+ int *err)
+{
+ const struct bpf_crypto_type *type;
+ struct bpf_crypto_ctx *ctx;
+
+ if (!params || params->reserved[0] || params->reserved[1] ||
+ params__sz != sizeof(struct bpf_crypto_params)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ type = bpf_crypto_get_type(params->type);
+ if (IS_ERR(type)) {
+ *err = PTR_ERR(type);
+ return NULL;
+ }
+
+ if (!type->has_algo(params->algo)) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!!params->authsize ^ !!type->setauthsize) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!params->key_len || params->key_len > sizeof(params->key)) {
+ *err = -EINVAL;
+ goto err_module_put;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ *err = -ENOMEM;
+ goto err_module_put;
+ }
+
+ ctx->type = type;
+ ctx->tfm = type->alloc_tfm(params->algo);
+ if (IS_ERR(ctx->tfm)) {
+ *err = PTR_ERR(ctx->tfm);
+ goto err_free_ctx;
+ }
+
+ if (params->authsize) {
+ *err = type->setauthsize(ctx->tfm, params->authsize);
+ if (*err)
+ goto err_free_tfm;
+ }
+
+ *err = type->setkey(ctx->tfm, params->key, params->key_len);
+ if (*err)
+ goto err_free_tfm;
+
+ if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+ *err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+ refcount_set(&ctx->usage, 1);
+
+ return ctx;
+
+err_free_tfm:
+ type->free_tfm(ctx->tfm);
+err_free_ctx:
+ kfree(ctx);
+err_module_put:
+ module_put(type->owner);
+
+ return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+ struct bpf_crypto_ctx *ctx;
+
+ ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+ ctx->type->free_tfm(ctx->tfm);
+ module_put(ctx->type->owner);
+ kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+ if (!refcount_inc_not_zero(&ctx->usage))
+ return NULL;
+ return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+ if (refcount_dec_and_test(&ctx->usage))
+ call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv,
+ bool decrypt)
+{
+ u32 src_len, dst_len, siv_len;
+ const u8 *psrc;
+ u8 *pdst, *piv;
+
+ if (__bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ siv_len = __bpf_dynptr_size(siv);
+ src_len = __bpf_dynptr_size(src);
+ dst_len = __bpf_dynptr_size(dst);
+ /* Only src_len is passed to the cipher callback as the data length,
+ * so reject a dst that is smaller than src.
+ */
+ if (!src_len || !dst_len || src_len > dst_len)
+ return -EINVAL;
+
+ if (siv_len != ctx->siv_len)
+ return -EINVAL;
+
+ psrc = __bpf_dynptr_data(src, src_len);
+ if (!psrc)
+ return -EINVAL;
+ pdst = __bpf_dynptr_data_rw(dst, dst_len);
+ if (!pdst)
+ return -EINVAL;
+
+ piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+ if (siv_len && !piv)
+ return -EINVAL;
+
+ return decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+ : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv: bpf_dynptr to IV data and state data to be used by decryptor.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv)
+{
+ return bpf_crypto_crypt(ctx, src, dst, siv, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer.
+ * @siv: bpf_dynptr to IV data and state data to be used by encryptor.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv)
+{
+ return bpf_crypto_crypt(ctx, src, dst, siv, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+ {
+ .btf_id = bpf_crypto_dtor_ids[0],
+ .kfunc_btf_id = bpf_crypto_dtor_ids[1]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &crypt_init_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+ ARRAY_SIZE(bpf_crypto_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bd2e2dd04740..309c4aa1b026 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn)
insn->off == BPF_ADDR_SPACE_CAST;
}
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU (-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
insn->code, insn->dst_reg,
insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+ } else if (is_mov_percpu_addr(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+ insn->code, insn->dst_reg, insn->src_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3a088a5349bc..06115f8728e8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab);
}
-static void htab_free_prealloced_timers(struct bpf_htab *htab)
+static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -235,7 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ elem->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
@@ -1490,11 +1493,12 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_del_rcu(&l->hash_node);
htab_elem_free(htab, l);
}
+ cond_resched();
}
migrate_enable();
}
-static void htab_free_malloced_timers(struct bpf_htab *htab)
+static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
{
int i;
@@ -1506,24 +1510,29 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
- bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ l->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers(struct bpf_map *map)
+static void htab_map_free_timers_and_wq(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We only free timer on uref dropping to zero */
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
- if (!htab_is_prealloc(htab))
- htab_free_malloced_timers(htab);
- else
- htab_free_prealloced_timers(htab);
+ /* We only free timer and workqueue on uref dropping to zero */
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers_and_wq(htab);
+ else
+ htab_free_prealloced_timers_and_wq(htab);
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -1538,7 +1547,7 @@ static void htab_map_free(struct bpf_map *map)
*/
/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
- * underneath and is reponsible for waiting for callbacks to finish
+ * underneath and is responsible for waiting for callbacks to finish
* during bpf_mem_alloc_destroy().
*/
if (!htab_is_prealloc(htab)) {
@@ -2259,7 +2268,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2280,7 +2289,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
@@ -2307,6 +2316,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
+static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+ offsetof(struct htab_elem, key) + map->key_size);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+
+ return insn - insn_buf;
+}
+
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct htab_elem *l;
@@ -2435,6 +2464,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_gen_lookup = htab_percpu_map_gen_lookup,
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a89587859571..2a69a9a36c0f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+struct bpf_async_cb {
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+ struct rcu_head rcu;
+ u64 flags;
+};
+
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
- * That space is used to keep 'struct bpf_timer_kern'.
+ * That space is used to keep 'struct bpf_async_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
@@ -1096,17 +1105,23 @@ const struct bpf_func_proto bpf_snprintf_proto = {
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
+ struct bpf_async_cb cb;
struct hrtimer timer;
- struct bpf_map *map;
- struct bpf_prog *prog;
- void __rcu *callback_fn;
- void *value;
- struct rcu_head rcu;
};
-/* the actual struct hidden inside uapi struct bpf_timer */
-struct bpf_timer_kern {
- struct bpf_hrtimer *timer;
+struct bpf_work {
+ struct bpf_async_cb cb;
+ struct work_struct work;
+ struct work_struct delete_work;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
+struct bpf_async_kern {
+ union {
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *timer;
+ struct bpf_work *work;
+ };
/* bpf_spin_lock is used here instead of spinlock_t to make
* sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
@@ -1114,19 +1129,24 @@ struct bpf_timer_kern {
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
- struct bpf_map *map = t->map;
- void *value = t->value;
+ struct bpf_map *map = t->cb.map;
+ void *value = t->cb.value;
bpf_callback_t callback_fn;
void *key;
u32 idx;
BTF_TYPE_EMIT(struct bpf_timer);
- callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1155,46 +1175,112 @@ out:
return HRTIMER_NORESTART;
}
-BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
- u64, flags)
+static void bpf_wq_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, work);
+ struct bpf_async_cb *cb = &w->cb;
+ struct bpf_map *map = cb->map;
+ bpf_callback_t callback_fn;
+ void *value = cb->value;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ callback_fn = READ_ONCE(cb->callback_fn);
+ if (!callback_fn)
+ return;
+
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ rcu_read_lock_trace();
+ migrate_disable();
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+ cancel_work_sync(&w->work);
+
+ kfree_rcu(w, cb.rcu);
+}
+
+static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
{
- clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_async_cb *cb;
struct bpf_hrtimer *t;
+ struct bpf_work *w;
+ clockid_t clockid;
+ size_t size;
int ret = 0;
- BUILD_BUG_ON(MAX_CLOCKS != 16);
- BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
- BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
-
if (in_nmi())
return -EOPNOTSUPP;
- if (flags >= MAX_CLOCKS ||
- /* similar to timerfd except _ALARM variants are not supported */
- (clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME &&
- clockid != CLOCK_BOOTTIME))
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ size = sizeof(struct bpf_hrtimer);
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ size = sizeof(struct bpf_work);
+ break;
+ default:
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
+ }
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ t = async->timer;
if (t) {
ret = -EBUSY;
goto out;
}
+
/* allocate hrtimer via map_kmalloc to use memcg accounting */
- t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
- if (!t) {
+ cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ if (!cb) {
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->record->timer_off;
- t->map = map;
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
- WRITE_ONCE(timer->timer, t);
- /* Guarantee the order between timer->timer and map->usercnt. So
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ cb->value = (void *)async - map->record->timer_off;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ w = (struct bpf_work *)cb;
+
+ INIT_WORK(&w->work, bpf_wq_work);
+ INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+ cb->value = (void *)async - map->record->wq_off;
+ break;
+ }
+ cb->map = map;
+ cb->prog = NULL;
+ cb->flags = flags;
+ rcu_assign_pointer(cb->callback_fn, NULL);
+
+ WRITE_ONCE(async->cb, cb);
+ /* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
* timer or atomic64_read() below returns a zero usercnt.
@@ -1204,15 +1290,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
- WRITE_ONCE(timer->timer, NULL);
- kfree(t);
+ WRITE_ONCE(async->cb, NULL);
+ kfree(cb);
ret = -EPERM;
}
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ /* flags is now known to hold nothing but one of the clock ids above */
+ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
@@ -1222,22 +1327,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
- struct bpf_prog_aux *, aux)
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog_aux *aux, unsigned int flags,
+ enum bpf_async_type type)
{
struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
+ __bpf_spin_lock_irqsave(&async->lock);
+ cb = async->cb;
+ if (!cb) {
ret = -EINVAL;
goto out;
}
- if (!atomic64_read(&t->map->usercnt)) {
+ if (!atomic64_read(&cb->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
@@ -1246,7 +1352,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
ret = -EPERM;
goto out;
}
- prev = t->prog;
+ prev = cb->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
@@ -1259,14 +1365,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- t->prog = prog;
+ cb->prog = prog;
}
- rcu_assign_pointer(t->callback_fn, callback_fn);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@@ -1275,7 +1387,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
@@ -1287,7 +1399,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
- if (!t || !t->prog) {
+ if (!t || !t->cb.prog) {
ret = -EINVAL;
goto out;
}
@@ -1315,18 +1427,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_hrtimer *t)
+static void drop_prog_refcnt(struct bpf_async_cb *async)
{
- struct bpf_prog *prog = t->prog;
+ struct bpf_prog *prog = async->prog;
if (prog) {
bpf_prog_put(prog);
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
+ async->prog = NULL;
+ rcu_assign_pointer(async->callback_fn, NULL);
}
}
-BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
struct bpf_hrtimer *t;
int ret = 0;
@@ -1348,7 +1460,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
ret = -EDEADLK;
goto out;
}
- drop_prog_refcnt(t);
+ drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
@@ -1366,36 +1478,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_timer_kern *timer = val;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- /* Performance optimization: read timer->timer without lock first. */
- if (!READ_ONCE(timer->timer))
- return;
+ /* Performance optimization: read async->cb without lock first. */
+ if (!READ_ONCE(async->cb))
+ return NULL;
- __bpf_spin_lock_irqsave(&timer->lock);
+ __bpf_spin_lock_irqsave(&async->lock);
/* re-read it under lock */
- t = timer->timer;
- if (!t)
+ cb = async->cb;
+ if (!cb)
goto out;
- drop_prog_refcnt(t);
+ drop_prog_refcnt(cb);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
- WRITE_ONCE(timer->timer, NULL);
+ WRITE_ONCE(async->cb, NULL);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return cb;
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_hrtimer *t;
+
+ t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+
if (!t)
return;
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
* right after for both preallocated and non-preallocated maps.
- * The timer->timer = NULL was already done and no code path can
+ * The async->cb = NULL was already done and no code path can
* see address 't' anymore.
*
* Check that bpf_map_delete/update_elem() wasn't called from timer
@@ -1404,13 +1524,33 @@ out:
* return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since timer->timer = NULL was already done. The timer will be
+ * since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
- kfree_rcu(t, rcu);
+ kfree_rcu(t, cb.rcu);
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ struct bpf_work *work;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+ if (!work)
+ return;
+ /* Trigger cancel of the sleepable work, but *do not* wait for
+ * it to finish if it was running as we might not be in a
+ * sleepable context.
+ * kfree will be called once the work has finished.
+ */
+ schedule_work(&work->delete_work);
}
BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
@@ -1443,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1730,6 +1870,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_strtol_proto;
case BPF_FUNC_strtoul:
return &bpf_strtoul_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
default:
break;
}
@@ -2408,7 +2552,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
*
* For skb-type dynptrs, it is safe to write into the returned pointer
- * if the bpf program allows skb data writes. There are two possiblities
+ * if the bpf program allows skb data writes. There are two possibilities
* that may occur when calling bpf_dynptr_slice_rdwr:
*
* 1) The requested slice is in the head of the skb. In this case, the
@@ -2545,10 +2689,65 @@ __bpf_kfunc void bpf_throw(u64 cookie)
WARN(1, "A call to BPF exception callback should never return\n");
}
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_map *map = p__map;
+
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_work *w;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ w = READ_ONCE(async->work);
+ if (!w || !READ_ONCE(w->cb.prog))
+ return -EINVAL;
+
+ schedule_work(&w->work);
+ return 0;
+}
+
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+ unsigned int flags,
+ void *aux__ign)
+{
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+ preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+ preempt_enable();
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
@@ -2621,6 +2820,12 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -2648,6 +2853,7 @@ static int __init kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 2a243cf37c60..4bd8f17a9f24 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
if (type & PTR_MAYBE_NULL) {
if (base_type(type) == PTR_TO_BTF_ID)
- strncpy(postfix, "or_null_", 16);
+ strscpy(postfix, "or_null_");
else
- strncpy(postfix, "_or_null", 16);
+ strscpy(postfix, "_or_null");
}
snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 050fe1ebf0f7..0218a5132ab5 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index)
}
/**
- * longest_prefix_match() - determine the longest prefix
+ * __longest_prefix_match() - determine the longest prefix
* @trie: The trie to get internal sizes from
* @node: The node to operate on
* @key: The key to compare to @node
*
* Determine the longest prefix of @node that matches the bits in @key.
*/
-static size_t longest_prefix_match(const struct lpm_trie *trie,
- const struct lpm_trie_node *node,
- const struct bpf_lpm_trie_key_u8 *key)
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
{
u32 limit = min(node->prefixlen, key->prefixlen);
u32 prefixlen = 0, i = 0;
@@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
return prefixlen;
}
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ return __longest_prefix_match(trie, node, key);
+}
+
/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
@@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* If it's the maximum possible prefix for this trie, we have
* an exact match and can return it directly.
*/
- matchlen = longest_prefix_match(trie, node, key);
+ matchlen = __longest_prefix_match(trie, node, key);
if (matchlen == trie->max_prefixlen) {
found = node;
break;
@@ -308,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map,
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
struct bpf_lpm_trie_key_u8 *key = _key;
unsigned long irq_flags;
@@ -382,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map,
trie->n_entries--;
rcu_assign_pointer(*slot, new_node);
- kfree_rcu(node, rcu);
+ free_node = node;
goto out;
}
@@ -429,6 +438,7 @@ out:
}
spin_unlock_irqrestore(&trie->lock, irq_flags);
+ kfree_rcu(free_node, rcu);
return ret;
}
@@ -437,6 +447,7 @@ out:
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
struct bpf_lpm_trie_key_u8 *key = _key;
struct lpm_trie_node __rcu **trim, **trim2;
struct lpm_trie_node *node, *parent;
@@ -506,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
else
rcu_assign_pointer(
*trim2, rcu_access_pointer(parent->child[0]));
- kfree_rcu(parent, rcu);
- kfree_rcu(node, rcu);
+ free_parent = parent;
+ free_node = node;
goto out;
}
@@ -521,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
else
RCU_INIT_POINTER(*trim, NULL);
- kfree_rcu(node, rcu);
+ free_node = node;
out:
spin_unlock_irqrestore(&trie->lock, irq_flags);
+ kfree_rcu(free_parent, rcu);
+ kfree_rcu(free_node, rcu);
return ret;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ae2ff73bde7e..cf6285760aea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to release */
break;
default:
@@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to acquire */
break;
default:
@@ -659,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ return;
+ bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -679,6 +688,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
break;
+ case BPF_WORKQUEUE:
+ bpf_wq_cancel_and_free(field_ptr);
+ break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -1085,7 +1097,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT,
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1115,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
}
break;
case BPF_TIMER:
+ case BPF_WORKQUEUE:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -3024,17 +3037,46 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+ struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+ /* free bpf_link and its containing memory */
+ link->ops->dealloc_deferred(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_link_defer_dealloc_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bool sleepable = false;
+
bpf_link_free_id(link->id);
if (link->prog) {
+ sleepable = link->prog->sleepable;
/* detach BPF program, clean up used resources */
link->ops->release(link);
bpf_prog_put(link->prog);
}
- /* free bpf_link and its containing memory */
- link->ops->dealloc(link);
+ if (link->ops->dealloc_deferred) {
+ /* schedule BPF link deallocation; if underlying BPF program
+ * is sleepable, we need to first wait for RCU tasks trace
+ * sync, then go through "classic" RCU grace period
+ */
+ if (sleepable)
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ }
+ if (link->ops->dealloc)
+ link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -3469,17 +3511,12 @@ out_put_prog:
return err;
}
-struct bpf_raw_tp_link {
- struct bpf_link link;
- struct bpf_raw_event_map *btp;
-};
-
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
- bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+ bpf_probe_unregister(raw_tp->btp, raw_tp);
bpf_put_raw_tracepoint(raw_tp->btp);
}
@@ -3544,7 +3581,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
- .dealloc = bpf_raw_tp_link_dealloc,
+ .dealloc_deferred = bpf_raw_tp_link_dealloc,
.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
@@ -3779,7 +3816,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
#endif /* CONFIG_PERF_EVENTS */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- const char __user *user_tp_name)
+ const char __user *user_tp_name, u64 cookie)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
@@ -3826,6 +3863,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
&bpf_raw_tp_link_lops, prog);
link->btp = btp;
+ link->cookie = cookie;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -3833,7 +3871,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
goto out_put_btp;
}
- err = bpf_probe_register(link->btp, prog);
+ err = bpf_probe_register(link->btp, link);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
@@ -3846,11 +3884,13 @@ out_put_btp:
return err;
}
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
struct bpf_prog *prog;
+ void __user *tp_name;
+ __u64 cookie;
int fd;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
@@ -3860,7 +3900,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
- fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ cookie = attr->raw_tracepoint.cookie;
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
if (fd < 0)
bpf_prog_put(prog);
return fd;
@@ -3956,6 +3998,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
* check permissions at attach time.
*/
return -EPERM;
+
+ ptype = attach_type_to_prog_type(attach_type);
+ if (prog->type != ptype)
+ return -EINVAL;
+
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -3974,11 +4021,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
attach_type != BPF_TRACE_KPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_KPROBE_SESSION)
+ return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
if (attach_type != BPF_PERF_EVENT &&
attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_SESSION &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
return 0;
@@ -5198,7 +5249,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
}
if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- ret = bpf_raw_tp_link_attach(prog, NULL);
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -5213,6 +5264,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_link_create(attr, prog);
+ break;
#ifdef CONFIG_NET
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
@@ -5235,7 +5290,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
ret = bpf_kprobe_multi_link_attach(attr, prog);
else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
ret = bpf_uprobe_multi_link_attach(attr, prog);
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index ef6911aee3bb..fedb54c94cdb 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -9,8 +9,8 @@
#include <linux/sysfs.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
static ssize_t
btf_vmlinux_read(struct file *file, struct kobject *kobj,
@@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void)
{
bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
- if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
+ if (bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index db7599c59c78..f8302a5ca400 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
NULL, im->ip_epilogue);
WARN_ON(err);
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
else
percpu_ref_kill(&im->pcref);
@@ -456,7 +456,9 @@ again:
if (err < 0)
goto out_free;
- arch_protect_bpf_trampoline(im->image, im->size);
+ err = arch_protect_bpf_trampoline(im->image, im->size);
+ if (err)
+ goto out_free;
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
@@ -883,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
* Hence check that 'start' is valid.
*/
start > NO_START_TIME) {
+ u64 duration = sched_clock() - start;
unsigned long flags;
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
- u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_add(&stats->nsecs, duration);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
@@ -1072,17 +1075,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
bpf_jit_free_exec(image);
}
-void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
-{
- WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_rox((long)image, 1);
-}
-
-void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
{
WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_nx((long)image, 1);
- set_memory_rw((long)image, 1);
+ return set_memory_rox((long)image, 1);
}
int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 63749ad5ac6b..77da1f438bec 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set;
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
- /* verifer state is 'st'
+ /* verifier state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
@@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem {
#define BPF_MAP_KEY_POISON (1ULL << 63)
#define BPF_MAP_KEY_SEEN (1ULL << 62)
-#define BPF_MAP_PTR_UNPRIV 1UL
-#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
- POISON_POINTER_DELTA))
-#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
-
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
@@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
- return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
+ return aux->map_ptr_state.poison;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
- return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
+ return aux->map_ptr_state.unpriv;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
- const struct bpf_map *map, bool unpriv)
+ struct bpf_map *map,
+ bool unpriv, bool poison)
{
- BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
- aux->map_ptr_state = (unsigned long)map |
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+ aux->map_ptr_state.unpriv = unpriv;
+ aux->map_ptr_state.poison = poison;
+ aux->map_ptr_state.map_ptr = map;
}
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
@@ -336,6 +332,10 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
+ struct {
+ struct bpf_map *ptr;
+ int uid;
+ } map;
u64 mem_size;
};
@@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
static bool is_async_callback_calling_insn(struct bpf_insn *insn)
{
- return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
+ return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
}
static bool is_may_goto_insn(struct bpf_insn *insn)
@@ -1429,6 +1434,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->active_rcu_lock = src->active_rcu_lock;
+ dst_state->active_preempt_lock = src->active_preempt_lock;
+ dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
dst_state->active_lock.ptr = src->active_lock.ptr;
dst_state->active_lock.id = src->active_lock.id;
@@ -1842,6 +1849,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
*/
if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -2135,7 +2144,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
{
/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
- * values on both sides of 64-bit range in hope to have tigher range.
+ * values on both sides of 64-bit range in hope to have tighter range.
* E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
* 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
* With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
@@ -2143,7 +2152,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
* _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
* better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
* We just need to make sure that derived bounds we are intersecting
- * with are well-formed ranges in respecitve s64 or u64 domain, just
+ * with are well-formed ranges in respective s64 or u64 domain, just
* like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
__u64 new_umin, new_umax;
@@ -2359,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
}
#define DEF_NOT_SUBREG (0)
@@ -2402,7 +2413,7 @@ static void init_func_state(struct bpf_verifier_env *env,
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
- int subprog)
+ int subprog, bool is_sleepable)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
@@ -2429,6 +2440,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* Initialize it similar to do_check_common().
*/
elem->st.branches = 1;
+ elem->st.in_sleepable = is_sleepable;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@@ -3615,7 +3627,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* sreg needs precision before this insn
*/
bt_clear_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -3631,7 +3644,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* both dreg and sreg need precision
* before this insn
*/
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
@@ -5274,7 +5288,8 @@ bad_type:
static bool in_sleepable(struct bpf_verifier_env *env)
{
- return env->prog->sleepable;
+ return env->prog->sleepable ||
+ (env->cur_state && env->cur_state->in_sleepable);
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -5297,6 +5312,7 @@ BTF_ID(struct, cgroup)
BTF_ID(struct, bpf_cpumask)
#endif
BTF_ID(struct, task_struct)
+BTF_ID(struct, bpf_crypto_ctx)
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -5386,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
- /* For mark_ptr_or_null_reg */
- val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
@@ -5682,6 +5696,13 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_ARENA;
+}
+
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
@@ -5698,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg)
return true;
/* Types listed in the reg2btf_ids are always trusted */
- if (reg2btf_ids[base_type(reg->type)])
+ if (reg2btf_ids[base_type(reg->type)] &&
+ !bpf_type_has_unsafe_modifiers(reg->type))
return true;
/* If a register is not referenced, it is trusted if it has the
@@ -6318,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null)
/*
* Allow list few fields as RCU trusted or full trusted.
@@ -6381,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) {
struct inode *d_inode;
};
-BTF_TYPE_SAFE_TRUSTED(struct socket) {
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
@@ -6416,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
+static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
+ "__safe_trusted_or_null");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -6529,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
*/
if (type_is_trusted(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED;
+ } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
if (type_is_rcu(env, reg, field_name, btf_id)) {
/* ignore __rcu tag and mark it MEM_RCU */
@@ -6694,6 +6728,11 @@ static int check_stack_access_within_bounds(
err = check_stack_slot_within_bounds(env, min_off, state, type);
if (!err && max_off > 0)
err = -EINVAL; /* out of stack access into non-negative offsets */
+ if (!err && access_size < 0)
+ /* access_size should not be negative (or overflow an int); other checks
+ * along the way should have prevented such an access.
+ */
+ err = -EFAULT; /* invalid negative access size; integer overflow? */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -6960,6 +6999,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch);
+
static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
int load_reg;
@@ -7019,7 +7061,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg)) {
+ is_sk_reg(env, insn->dst_reg) ||
+ (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -7055,6 +7098,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (err)
return err;
+ if (is_arena_reg(env, insn->dst_reg)) {
+ err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+ if (err)
+ return err;
+ }
/* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
@@ -7577,6 +7625,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_wq_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (map->record->wq_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
+ val + reg->off, map->record->wq_off);
+ return -EINVAL;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
@@ -9471,7 +9536,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
*/
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
- !is_sync_callback_calling_kfunc(insn->imm)) {
+ !is_callback_calling_kfunc(insn->imm)) {
verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
@@ -9485,10 +9550,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
- /* there is no real recursion here. timer callbacks are async */
+ /* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- insn_idx, subprog);
+ insn_idx, subprog,
+ is_bpf_wq_set_callback_impl_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@@ -9548,6 +9614,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
+ /* Global subprogs cannot be called with preemption disabled; static ones can. */
+ if (env->cur_state->active_preempt_lock) {
+ verbose(env, "global function calls are not allowed with preemption disabled,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name);
@@ -9640,12 +9713,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_map *map;
int err;
- if (bpf_map_ptr_poisoned(insn_aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ /* valid map_ptr and poison value does not matter */
+ map = insn_aux->map_ptr_state.map_ptr;
if (!map->ops->map_set_for_each_callback_args ||
!map->ops->map_for_each_callback) {
verbose(env, "callback function not allowed for map\n");
@@ -10004,12 +10073,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
- if (!BPF_MAP_PTR(aux->map_ptr_state))
+ if (!aux->map_ptr_state.map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1);
- else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
- bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- !meta->map_ptr->bypass_spec_v1);
+ !meta->map_ptr->bypass_spec_v1, true);
return 0;
}
@@ -10188,8 +10257,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (env->ops->get_func_proto)
fn = env->ops->get_func_proto(func_id, env->prog);
if (!fn) {
- verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
- func_id);
+ verbose(env, "program of this type cannot use helper %s#%d\n",
+ func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -10238,6 +10307,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
+ if (env->cur_state->active_preempt_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -10826,6 +10906,7 @@ enum {
KF_ARG_LIST_NODE_ID,
KF_ARG_RB_ROOT_ID,
KF_ARG_RB_NODE_ID,
+ KF_ARG_WORKQUEUE_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -10834,6 +10915,7 @@ BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
+BTF_ID(struct, bpf_wq)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -10877,6 +10959,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
+static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -10946,6 +11033,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_WORKQUEUE,
};
enum special_kfunc_type {
@@ -10971,7 +11059,11 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
+ KF_bpf_wq_set_callback_impl,
+ KF_bpf_preempt_disable,
+ KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
+ KF_bpf_session_cookie,
};
BTF_SET_START(special_kfunc_set)
@@ -10995,6 +11087,7 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#endif
@@ -11023,11 +11116,15 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#else
BTF_ID_UNUSED
#endif
+BTF_ID(func, bpf_session_cookie)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -11049,6 +11146,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
}
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
@@ -11102,6 +11209,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_map(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_MAP;
+ if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_WORKQUEUE;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11353,12 +11463,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
{
return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
insn->imm == special_kfunc_list[KF_bpf_throw];
}
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return is_sync_callback_calling_kfunc(btf_id) ||
+ is_async_callback_calling_kfunc(btf_id);
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11703,6 +11829,34 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_NULL:
continue;
case KF_ARG_PTR_TO_MAP:
+ if (!reg->map_ptr) {
+ verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ return -EINVAL;
+ }
+ if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * wq = bpf_map_lookup_elem(inner_map1);
+ * if (wq)
+ * // mismatch would have been allowed
+ * bpf_wq_init(wq, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
+ verbose(env,
+ "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
+ fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11735,6 +11889,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_CALLBACK:
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
+ case KF_ARG_PTR_TO_WORKQUEUE:
/* Trusted by default */
break;
default:
@@ -12021,6 +12176,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret)
return ret;
break;
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_wq_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
}
}
@@ -12080,11 +12244,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
- const struct btf_type *t, *ptr_type;
+ bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
u32 i, nargs, ptr_type_id, release_ref_obj_id;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
+ const struct btf_type *t, *ptr_type;
struct bpf_kfunc_call_arg_meta meta;
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
@@ -12132,9 +12296,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
+ meta.r0_size = sizeof(u64);
+ meta.r0_rdonly = false;
+ }
+
+ if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+ preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
@@ -12167,6 +12349,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
+ if (env->cur_state->active_preempt_lock) {
+ if (preempt_disable) {
+ env->cur_state->active_preempt_lock++;
+ } else if (preempt_enable) {
+ env->cur_state->active_preempt_lock--;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+ return -EACCES;
+ }
+ } else if (preempt_disable) {
+ env->cur_state->active_preempt_lock++;
+ } else if (preempt_enable) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -13305,7 +13503,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (src_known && dst_known) {
@@ -13318,18 +13515,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13338,7 +13533,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
@@ -13351,18 +13545,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13374,7 +13566,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
if (src_known && dst_known) {
@@ -13387,18 +13578,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13407,7 +13596,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
@@ -13420,18 +13608,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13443,7 +13629,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -13454,10 +13639,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u32 result into s32.
- */
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
@@ -13471,7 +13656,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
if (src_known && dst_known) {
/* dst_reg->var_off.value has been updated earlier */
@@ -13483,10 +13667,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u64 result into s64.
- */
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
@@ -13694,6 +13878,46 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
+ const struct bpf_reg_state *src_reg)
+{
+ bool src_is_const = false;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+
+ if (insn_bitness == 32) {
+ if (tnum_subreg_is_const(src_reg->var_off)
+ && src_reg->s32_min_value == src_reg->s32_max_value
+ && src_reg->u32_min_value == src_reg->u32_max_value)
+ src_is_const = true;
+ } else {
+ if (tnum_is_const(src_reg->var_off)
+ && src_reg->smin_value == src_reg->smax_value
+ && src_reg->umin_value == src_reg->umax_value)
+ src_is_const = true;
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_XOR:
+ case BPF_OR:
+ case BPF_MUL:
+ return true;
+
+ /* Shift operators range is only computable if shift dimension operand
+ * is a constant. Shifts greater than 31 or 63 are undefined. This
+ * includes shifts by a negative number.
+ */
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ return (src_is_const && src_reg->umax_value < insn_bitness);
+ default:
+ return false;
+ }
+}
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -13703,53 +13927,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known;
- s64 smin_val, smax_val;
- u64 umin_val, umax_val;
- s32 s32_min_val, s32_max_val;
- u32 u32_min_val, u32_max_val;
- u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
- smin_val = src_reg.smin_value;
- smax_val = src_reg.smax_value;
- umin_val = src_reg.umin_value;
- umax_val = src_reg.umax_value;
-
- s32_min_val = src_reg.s32_min_value;
- s32_max_val = src_reg.s32_max_value;
- u32_min_val = src_reg.u32_min_value;
- u32_max_val = src_reg.u32_max_value;
-
- if (alu32) {
- src_known = tnum_subreg_is_const(src_reg.var_off);
- if ((src_known &&
- (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
- s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- } else {
- src_known = tnum_is_const(src_reg.var_off);
- if ((src_known &&
- (smin_val != smax_val || umin_val != umax_val)) ||
- smin_val > smax_val || umin_val > umax_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- }
-
- if (!src_known &&
- opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
__mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -13806,46 +13988,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_xor(dst_reg, &src_reg);
break;
case BPF_LSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_lsh(dst_reg, &src_reg);
else
scalar_min_max_lsh(dst_reg, &src_reg);
break;
case BPF_RSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_rsh(dst_reg, &src_reg);
else
scalar_min_max_rsh(dst_reg, &src_reg);
break;
case BPF_ARSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_arsh(dst_reg, &src_reg);
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
default:
- mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -14014,6 +14174,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
return -EINVAL;
}
+ if (!env->prog->aux->arena) {
+ verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
+ return -EINVAL;
+ }
} else {
if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
insn->off != 32) || insn->imm) {
@@ -14046,8 +14210,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->imm) {
/* off == BPF_ADDR_SPACE_CAST */
mark_reg_unknown(env, regs, insn->dst_reg);
- if (insn->imm == 1) /* cast from as(1) to as(0) */
+ if (insn->imm == 1) { /* cast from as(1) to as(0) */
dst_reg->type = PTR_TO_ARENA;
+ /* PTR_TO_ARENA is 32-bit */
+ dst_reg->subreg_def = env->insn_idx + 1;
+ }
} else if (insn->off == 0) {
/* case: R1 = R2
* copy register state to dest reg
@@ -14544,7 +14711,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
struct tnum t;
u64 val;
-again:
+ /* In case of GE/GT/SGE/SGT, reuse LE/LT/SLE/SLT logic from below */
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ break;
+ default:
+ break;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (is_jmp32) {
@@ -14687,14 +14866,6 @@ again:
reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- case BPF_JGE:
- case BPF_JGT:
- case BPF_JSGE:
- case BPF_JSGT:
- /* just reuse LE/LT logic above */
- opcode = flip_opcode(opcode);
- swap(reg1, reg2);
- goto again;
default:
return;
}
@@ -14702,7 +14873,7 @@ again:
/* Adjusts the register min/max values in the case that the dst_reg and
* src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
* Technically we can do similar adjustments for pointers to the same object,
* but we don't support that right now.
*/
@@ -15317,6 +15488,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (env->cur_state->active_preempt_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -16884,6 +17060,12 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
+ if (old->active_preempt_lock != cur->active_preempt_lock)
+ return false;
+
+ if (old->in_sleepable != cur->in_sleepable)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -17340,7 +17522,7 @@ hit:
err = propagate_liveness(env, &sl->state, cur);
/* if previous state reached the exit with precision and
- * current state is equivalent to it (except precsion marks)
+ * current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
@@ -17518,7 +17700,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
}
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
- bool allow_trust_missmatch)
+ bool allow_trust_mismatch)
{
enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
@@ -17536,7 +17718,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* src_reg == stack|map in some other branch.
* Reject it.
*/
- if (allow_trust_missmatch &&
+ if (allow_trust_mismatch &&
base_type(type) == PTR_TO_BTF_ID &&
base_type(*prev_type) == PTR_TO_BTF_ID) {
/*
@@ -17832,6 +18014,13 @@ process_bpf_exit_full:
return -EINVAL;
}
+ if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
+ verbose(env, "%d bpf_preempt_enable%s missing\n",
+ env->cur_state->active_preempt_lock,
+ env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
+ return -EINVAL;
+ }
+
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
@@ -18129,6 +18318,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_wq yet\n");
+ return -EINVAL;
+ }
+ }
+
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -18269,8 +18465,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
f = fdget(fd);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn[0].imm);
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
return PTR_ERR(map);
}
@@ -18324,6 +18519,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
}
if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
fdput(f);
return -E2BIG;
}
@@ -18359,15 +18556,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
}
if (!env->prog->jit_requested) {
verbose(env, "JIT is required to use arena\n");
+ fdput(f);
return -EOPNOTSUPP;
}
if (!bpf_jit_supports_arena()) {
verbose(env, "JIT doesn't support arena\n");
+ fdput(f);
return -EOPNOTSUPP;
}
env->prog->aux->arena = (void *)map;
if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ fdput(f);
return -EINVAL;
}
}
@@ -18935,6 +19135,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
} else {
continue;
}
@@ -19121,12 +19327,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
- if (bpf_pseudo_func(insn))
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
/* jit (e.g. x86_64) may emit fewer instructions
* if it learns a u32 imm is the same as a u64 imm.
- * Force a non zero here.
+ * Set close enough to possible prog address.
*/
- insn[1].imm = 1;
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -19158,6 +19371,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
@@ -19198,6 +19412,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
BPF_CLASS(insn->code) == BPF_ST) &&
BPF_MODE(insn->code) == BPF_PROBE_MEM32)
num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+ num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
@@ -19262,10 +19479,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* bpf_prog_load will add the kallsyms for the main program.
*/
for (i = 1; i < env->subprog_cnt; i++) {
- bpf_prog_lock_ro(func[i]);
- bpf_prog_kallsyms_add(func[i]);
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
}
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
/* Last step: make now unused interpreter insns from main
* prog consistent for later dump requests, so they can
* later look the same as if they were interpreted only.
@@ -19525,6 +19746,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
+ } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
}
return 0;
}
@@ -19601,8 +19829,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
/* convert to 32-bit mov that clears upper 32-bit */
insn->code = BPF_ALU | BPF_MOV | BPF_X;
- /* clear off, so it's a normal 'wX = wY' from JIT pov */
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
insn->off = 0;
+ insn->imm = 0;
} /* cast from as(0) to as(1) should be handled by JIT */
goto next_insn;
}
@@ -19652,6 +19881,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = &insn_buf[0];
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
@@ -19766,6 +20025,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -19799,7 +20062,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
!bpf_map_ptr_unpriv(aux)) {
struct bpf_jit_poke_descriptor desc = {
.reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+ .tail_call.map = aux->map_ptr_state.map_ptr,
.tail_call.key = bpf_map_key_immediate(aux),
.insn_idx = i + delta,
};
@@ -19828,7 +20091,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
return -EINVAL;
}
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -19936,7 +20199,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
@@ -20042,6 +20305,30 @@ patch_map_ops_generic:
goto next_insn;
}
+#ifdef CONFIG_X86_64
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ prog->jit_requested && bpf_jit_supports_percpu_insn()) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if pcpu_hot.cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+ insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
@@ -20125,6 +20412,62 @@ patch_map_ops_generic:
goto next_insn;
}
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
/* Implement bpf_kptr_xchg inline */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_kptr_xchg &&
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 7a5bbfc024b7..8a7ce7a6b3ab 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -23,6 +23,10 @@ CONFIG_SLAB_FREELIST_HARDENED=y
CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
CONFIG_RANDOM_KMALLOC_CACHES=y
+# Sanity check userspace page table mappings.
+CONFIG_PAGE_TABLE_CHECK=y
+CONFIG_PAGE_TABLE_CHECK_ENFORCED=y
+
# Randomize kernel stack offset on syscall entry.
CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
@@ -39,11 +43,12 @@ CONFIG_UBSAN=y
CONFIG_UBSAN_TRAP=y
CONFIG_UBSAN_BOUNDS=y
# CONFIG_UBSAN_SHIFT is not set
-# CONFIG_UBSAN_DIV_ZERO
-# CONFIG_UBSAN_UNREACHABLE
-# CONFIG_UBSAN_BOOL
-# CONFIG_UBSAN_ENUM
-# CONFIG_UBSAN_ALIGNMENT
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_UNREACHABLE is not set
+# CONFIG_UBSAN_SIGNED_WRAP is not set
+# CONFIG_UBSAN_BOOL is not set
+# CONFIG_UBSAN_ENUM is not set
+# CONFIG_UBSAN_ALIGNMENT is not set
# Sampling-based heap out-of-bounds and use-after-free detection.
CONFIG_KFENCE=y
@@ -81,6 +86,10 @@ CONFIG_SECCOMP_FILTER=y
# Provides some protections against SYN flooding.
CONFIG_SYN_COOKIES=y
+# Enable Kernel Control Flow Integrity (currently Clang only).
+CONFIG_CFI_CLANG=y
+# CONFIG_CFI_PERMISSIVE is not set
+
# Attack surface reduction: do not autoload TTY line disciplines.
# CONFIG_LDISC_AUTOLOAD is not set
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 70ae70d03823..24b1e1143260 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -432,7 +432,7 @@ static __always_inline void ct_kernel_enter(bool user, int offset) { }
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE(context_tracking_key);
+DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
static noinstr bool context_tracking_recursion_enter(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8f6affd051f7..63447eb85dab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3196,6 +3196,7 @@ void __init boot_cpu_hotplug_init(void)
this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
+#ifdef CONFIG_CPU_MITIGATIONS
/*
* These are used for a global "mitigations=" cmdline option for toggling
* optional CPU mitigations.
@@ -3206,8 +3207,7 @@ enum cpu_mitigations {
CPU_MITIGATIONS_AUTO_NOSMT,
};
-static enum cpu_mitigations cpu_mitigations __ro_after_init =
- CPU_MITIGATIONS_AUTO;
+static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
static int __init mitigations_parse_cmdline(char *arg)
{
@@ -3223,7 +3223,6 @@ static int __init mitigations_parse_cmdline(char *arg)
return 0;
}
-early_param("mitigations", mitigations_parse_cmdline);
/* mitigations=off */
bool cpu_mitigations_off(void)
@@ -3238,3 +3237,11 @@ bool cpu_mitigations_auto_nosmt(void)
return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
+#else
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
+ return 0;
+}
+#endif
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 75cd6a736d03..78b5dc7cee3a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,9 +11,14 @@
#include <linux/sizes.h>
#include <linux/kexec.h>
#include <linux/memory.h>
+#include <linux/mm.h>
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
+#include <linux/crash_core.h>
+#include <linux/reboot.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -26,451 +31,130 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
-/* vmcoreinfo stuff */
-unsigned char *vmcoreinfo_data;
-size_t vmcoreinfo_size;
-u32 *vmcoreinfo_note;
-
-/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-static unsigned char *vmcoreinfo_data_safecopy;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
+#ifdef CONFIG_CRASH_DUMP
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
- char *cur = cmdline, *tmp;
- unsigned long long total_mem = system_ram;
+ struct page *vmcoreinfo_page;
+ void *safecopy;
+
+ if (!IS_ENABLED(CONFIG_CRASH_DUMP))
+ return 0;
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
/*
- * Firmware sometimes reserves some memory regions for its own use,
- * so the system memory size is less than the actual physical memory
- * size. Work around this by rounding up the total size to 128M,
- * which is enough for most test cases.
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. As we have arch_kexec_protect_crashkres()
+ * after kexec syscall, we naturally protect it from write
+ * (even read) access under kernel direct mapping. But on
+ * the other hand, we still need to operate on it when a crash
+ * happens to generate the vmcoreinfo note; hence we rely on
+ * vmap for this purpose.
*/
- total_mem = roundup(total_mem, SZ_128M);
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= total_mem) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (total_mem >= start && total_mem < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- } else
- pr_info("crashkernel size resulted in zero bytes\n");
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
}
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
}
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char: %c\n", *cur);
- return -EINVAL;
- }
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
return 0;
}
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
- return ck_cmdline;
-}
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *suffix)
+int kexec_should_crash(struct task_struct *p)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
- char *name = "crashkernel=";
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
- if (!ck_cmdline)
- return -ENOENT;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
/*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
*/
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- *
- * If crashkernel=,high|low is supported on architecture, non-NULL values
- * should be passed to parameters 'low_size' and 'high'.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- unsigned long long *low_size,
- bool *high)
-{
- int ret;
-
- /* crashkernel=X[@offset] */
- ret = __parse_crashkernel(cmdline, system_ram, crash_size,
- crash_base, NULL);
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ if (crash_kexec_post_notifiers)
+ return 0;
/*
- * If non-NULL 'high' passed in and no normal crashkernel
- * setting detected, try parsing crashkernel=,high|low.
+ * There are 4 panic() calls in make_task_dead() path, each of which
+ * corresponds to each of these 4 conditions.
*/
- if (high && ret == -ENOENT) {
- ret = __parse_crashkernel(cmdline, 0, crash_size,
- crash_base, suffix_tbl[SUFFIX_HIGH]);
- if (ret || !*crash_size)
- return -EINVAL;
-
- /*
- * crashkernel=Y,low can be specified or not, but invalid value
- * is not allowed.
- */
- ret = __parse_crashkernel(cmdline, 0, low_size,
- crash_base, suffix_tbl[SUFFIX_LOW]);
- if (ret == -ENOENT) {
- *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- ret = 0;
- } else if (ret) {
- return ret;
- }
-
- *high = true;
- }
-#endif
- if (!*crash_size)
- ret = -EINVAL;
-
- return ret;
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
}
-/*
- * Add a dummy early_param handler to mark crashkernel= as a known command line
- * parameter and suppress incorrect warnings in init/main.c.
- */
-static int __init parse_crashkernel_dummy(char *arg)
+int kexec_crash_loaded(void)
{
- return 0;
+ return !!kexec_crash_image;
}
-early_param("crashkernel", parse_crashkernel_dummy);
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-static int __init reserve_crashkernel_low(unsigned long long low_size)
+/*
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __noclone __crash_kexec(struct pt_regs *regs)
{
-#ifdef CONFIG_64BIT
- unsigned long long low_base;
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
- return -ENOMEM;
+ /* Take the kexec_lock here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (kexec_trylock()) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ kexec_unlock();
}
-
- pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
- low_base, low_base + low_size, low_size >> 20);
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
-#endif
- return 0;
}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
-void __init reserve_crashkernel_generic(char *cmdline,
- unsigned long long crash_size,
- unsigned long long crash_base,
- unsigned long long crash_low_size,
- bool high)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
- unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
- bool fixed_base = false;
-
- /* User specifies base address explicitly. */
- if (crash_base) {
- fixed_base = true;
- search_base = crash_base;
- search_end = crash_base + crash_size;
- } else if (high) {
- search_base = CRASH_ADDR_LOW_MAX;
- search_end = CRASH_ADDR_HIGH_MAX;
- }
+ int old_cpu, this_cpu;
-retry:
- crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
- search_base, search_end);
- if (!crash_base) {
- /*
- * For crashkernel=size[KMG]@offset[KMG], print out failure
- * message if can't reserve the specified region.
- */
- if (fixed_base) {
- pr_warn("crashkernel reservation failed - memory is in use.\n");
- return;
- }
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
- /*
- * For crashkernel=size[KMG], if the first attempt was for
- * low memory, fall back to high memory, the minimum required
- * low memory will be reserved later.
- */
- if (!high && search_end == CRASH_ADDR_LOW_MAX) {
- search_end = CRASH_ADDR_HIGH_MAX;
- search_base = CRASH_ADDR_LOW_MAX;
- crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- goto retry;
- }
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
/*
- * For crashkernel=size[KMG],high, if the first attempt was
- * for high memory, fall back to low memory.
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
*/
- if (high && search_end == CRASH_ADDR_HIGH_MAX) {
- search_end = CRASH_ADDR_LOW_MAX;
- search_base = 0;
- goto retry;
- }
- pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
- crash_size);
- return;
- }
-
- if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
- crash_low_size && reserve_crashkernel_low(crash_low_size)) {
- memblock_phys_free(crash_base, crash_size);
- return;
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
}
-
- pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
- crash_base, crash_base + crash_size, crash_size >> 20);
-
- /*
- * The crashkernel memory will be removed from the kernel linear
- * map. Inform kmemleak so that it won't try to access it.
- */
- kmemleak_ignore_phys(crash_base);
- if (crashk_low_res.end)
- kmemleak_ignore_phys(crashk_low_res.start);
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
}
-static __init int insert_crashkernel_resources(void)
+static inline resource_size_t crash_resource_size(const struct resource *res)
{
- if (crashk_res.start < crashk_res.end)
- insert_resource(&iomem_resource, &crashk_res);
+ return !res->end ? 0 : resource_size(res);
+}
+
- if (crashk_low_res.start < crashk_low_res.end)
- insert_resource(&iomem_resource, &crashk_low_res);
- return 0;
-}
-early_initcall(insert_crashkernel_resources);
-#endif
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
@@ -633,205 +317,129 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
- void *data, size_t data_len)
+ssize_t crash_get_memory_size(void)
{
- struct elf_note *note = (struct elf_note *)buf;
-
- note->n_namesz = strlen(name) + 1;
- note->n_descsz = data_len;
- note->n_type = type;
- buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
- memcpy(buf, name, note->n_namesz);
- buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
- memcpy(buf, data, data_len);
- buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
-
- return buf;
-}
+ ssize_t size = 0;
-void final_note(Elf_Word *buf)
-{
- memset(buf, 0, sizeof(struct elf_note));
-}
+ if (!kexec_trylock())
+ return -EBUSY;
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
+ kexec_unlock();
+ return size;
}
-void crash_update_vmcoreinfo_safecopy(void *ptr)
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
{
- if (ptr)
- memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+ struct resource *ram_res;
- vmcoreinfo_data_safecopy = ptr;
-}
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
-void crash_save_vmcoreinfo(void)
-{
- if (!vmcoreinfo_note)
- return;
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ crashk_res.end = ram_res->start - 1;
+ }
- /* Use the safe copy to generate vmcoreinfo note if have */
- if (vmcoreinfo_data_safecopy)
- vmcoreinfo_data = vmcoreinfo_data_safecopy;
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
- vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
- update_vmcoreinfo_note();
+ return 0;
}
-void vmcoreinfo_append_str(const char *fmt, ...)
+int crash_shrink_memory(unsigned long new_size)
{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-
- WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
- "vmcoreinfo data exceeds allocated size, truncating");
-}
+ int ret = 0;
+ unsigned long old_size, low_size;
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
+ if (!kexec_trylock())
+ return -EBUSY;
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
- return __pa(vmcoreinfo_note);
-}
-EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
-static int __init crash_save_vmcoreinfo_init(void)
-{
- vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
- if (!vmcoreinfo_data) {
- pr_warn("Memory allocation for vmcoreinfo_data failed\n");
- return -ENOMEM;
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
}
- vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
- GFP_KERNEL | __GFP_ZERO);
- if (!vmcoreinfo_note) {
- free_page((unsigned long)vmcoreinfo_data);
- vmcoreinfo_data = NULL;
- pr_warn("Memory allocation for vmcoreinfo_note failed\n");
- return -ENOMEM;
+ /*
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
+ */
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
+
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
}
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_BUILD_ID();
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_OFFSET(uts_namespace, name);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
+unlock:
+ kexec_unlock();
+ return ret;
+}
-#ifndef CONFIG_NUMA
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL_ARRAY(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
- VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
- VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _refcount);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(page, compound_head);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLATMEM
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
- log_buf_vmcoreinfo_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_swapbacked);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
-#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
- VMCOREINFO_NUMBER(PG_hugetlb);
-#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
- VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
-#endif
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
-#ifdef CONFIG_KALLSYMS
- VMCOREINFO_SYMBOL(kallsyms_names);
- VMCOREINFO_SYMBOL(kallsyms_num_syms);
- VMCOREINFO_SYMBOL(kallsyms_token_table);
- VMCOREINFO_SYMBOL(kallsyms_token_index);
-#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
- VMCOREINFO_SYMBOL(kallsyms_offsets);
- VMCOREINFO_SYMBOL(kallsyms_relative_base);
-#else
- VMCOREINFO_SYMBOL(kallsyms_addresses);
-#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
-#endif /* CONFIG_KALLSYMS */
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
- return 0;
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.common.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
}
-subsys_initcall(crash_save_vmcoreinfo_init);
+
static int __init crash_notes_memory_init(void)
{
@@ -866,6 +474,8 @@ static int __init crash_notes_memory_init(void)
}
subsys_initcall(crash_notes_memory_init);
+#endif /*CONFIG_CRASH_DUMP*/
+
#ifdef CONFIG_CRASH_HOTPLUG
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
new file mode 100644
index 000000000000..066668799f75
--- /dev/null
+++ b/kernel/crash_reserve.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash_reserve.c - crashkernel command line parsing and memory reservation.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= total_mem) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (total_mem >= start && total_mem < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
+
+ return 0;
+}
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * This function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+ if (!ck_cmdline)
+ return -ENOENT;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on the architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ bool *high)
+{
+ int ret;
+
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
+
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out a failure
+ * message if the specified region can't be reserved.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
+}
+
+#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+static __init int insert_crashkernel_resources(void)
+{
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
+#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index f005c66f378c..055da410ac71 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -37,12 +37,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
-
#include <asm/page.h>
#include <linux/memblock.h>
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 98b2e192fd69..4d543b1e9d57 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -286,7 +286,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
} else {
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
}
memset(ret, 0, size);
@@ -307,6 +307,8 @@ out_encrypt_pages:
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
+out_leak_pages:
+ return NULL;
}
void dma_direct_free(struct device *dev, size_t size,
@@ -367,12 +369,11 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
- goto out_free_pages;
+ goto out_leak_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
-out_free_pages:
- __dma_direct_free_pages(dev, page, size);
+out_leak_pages:
return NULL;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b079a9a8e087..0de66f0ff43a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -69,11 +69,14 @@
* @alloc_size: Size of the allocated buffer.
* @list: The free list describing the number of free entries available
* from each index.
+ * @pad_slots: Number of preceding padding slots. Valid only in the first
+ * allocated non-padding slot.
*/
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
- unsigned int list;
+ unsigned short list;
+ unsigned short pad_slots;
};
static bool swiotlb_force_bounce;
@@ -287,6 +290,7 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
mem->nslabs - i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
}
memset(vaddr, 0, bytes);
@@ -821,12 +825,30 @@ void swiotlb_dev_init(struct device *dev)
#endif
}
-/*
- * Return the offset into a iotlb slot required to keep the device happy.
+/**
+ * swiotlb_align_offset() - Get required offset into an IO TLB allocation.
+ * @dev: Owning device.
+ * @align_mask: Allocation alignment mask.
+ * @addr: DMA address.
+ *
+ * Return the minimum offset from the start of an IO TLB allocation which is
+ * required for a given buffer address and allocation alignment to keep the
+ * device happy.
+ *
+ * First, the address bits covered by min_align_mask must be identical in the
+ * original address and the bounce buffer address. High bits are preserved by
+ * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra
+ * padding bytes before the bounce buffer.
+ *
+ * Second, @align_mask specifies which bits of the first allocated slot must
+ * be zero. This may require allocating additional padding slots, and then the
+ * offset (in bytes) from the first such padding slot is returned.
*/
-static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
+static unsigned int swiotlb_align_offset(struct device *dev,
+ unsigned int align_mask, u64 addr)
{
- return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
+ return addr & dma_get_min_align_mask(dev) &
+ (align_mask | (IO_TLB_SIZE - 1));
}
/*
@@ -841,27 +863,23 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
- unsigned int tlb_offset, orig_addr_offset;
+ int tlb_offset;
if (orig_addr == INVALID_PHYS_ADDR)
return;
- tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
- orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
- if (tlb_offset < orig_addr_offset) {
- dev_WARN_ONCE(dev, 1,
- "Access before mapping start detected. orig offset %u, requested offset %u.\n",
- orig_addr_offset, tlb_offset);
- return;
- }
-
- tlb_offset -= orig_addr_offset;
- if (tlb_offset > alloc_size) {
- dev_WARN_ONCE(dev, 1,
- "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n",
- alloc_size, size, tlb_offset);
- return;
- }
+ /*
+ * It's valid for tlb_offset to be negative. This can happen when the
+ * "offset" returned by swiotlb_align_offset() is non-zero, and the
+ * tlb_addr is pointing within the first "offset" bytes of the second
+ * or subsequent slots of the allocated swiotlb area. While it's not
+ * valid for tlb_addr to be pointing within the first "offset" bytes
+ * of the first slot, there's no way to check for such an error since
+ * this function can't distinguish the first slot from the second and
+ * subsequent slots.
+ */
+ tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
+ swiotlb_align_offset(dev, 0, orig_addr);
orig_addr += tlb_offset;
alloc_size -= tlb_offset;
@@ -956,6 +974,28 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
}
#endif /* CONFIG_DEBUG_FS */
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+#ifdef CONFIG_DEBUG_FS
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_add(nslots, &mem->transient_nslabs);
+}
+
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->transient_nslabs);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
/**
* swiotlb_search_pool_area() - search one memory area in one pool
* @dev: Device which maps the buffer.
@@ -981,10 +1021,9 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
- unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) | alloc_align_mask;
+ unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
unsigned int nslots = nr_slots(alloc_size), stride;
- unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr);
unsigned int index, slots_checked, count = 0, i;
unsigned long flags;
unsigned int slot_base;
@@ -994,18 +1033,29 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
BUG_ON(area_index >= pool->nareas);
/*
- * For allocations of PAGE_SIZE or larger only look for page aligned
- * allocations.
+ * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be
+ * page-aligned in the absence of any other alignment requirements.
+ * 'alloc_align_mask' was later introduced to specify the alignment
+ * explicitly, however this is passed as zero for streaming mappings
+ * and so we preserve the old behaviour there in case any drivers are
+ * relying on it.
+ */
+ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE)
+ alloc_align_mask = PAGE_SIZE - 1;
+
+ /*
+ * Ensure that the allocation is at least slot-aligned and update
+ * 'iotlb_align_mask' to ignore bits that will be preserved when
+ * offsetting into the allocation.
*/
- if (alloc_size >= PAGE_SIZE)
- iotlb_align_mask |= ~PAGE_MASK;
- iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
+ alloc_align_mask |= (IO_TLB_SIZE - 1);
+ iotlb_align_mask &= ~alloc_align_mask;
/*
* For mappings with an alignment requirement don't bother looping to
* unaligned slots once we found an aligned one.
*/
- stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
+ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));
spin_lock_irqsave(&area->lock, flags);
if (unlikely(nslots > pool->area_nslabs - area->used))
@@ -1015,11 +1065,14 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
index = area->index;
for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ phys_addr_t tlb_addr;
+
slot_index = slot_base + index;
+ tlb_addr = slot_addr(tbl_dma_addr, slot_index);
- if (orig_addr &&
- (slot_addr(tbl_dma_addr, slot_index) &
- iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ if ((tlb_addr & alloc_align_mask) ||
+ (orig_addr && (tlb_addr & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask))) {
index = wrap_area_index(pool, index + 1);
slots_checked++;
continue;
@@ -1170,6 +1223,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+ inc_transient_used(mem, pool->nslabs);
found:
WRITE_ONCE(dev->dma_uses_io_tlb, true);
@@ -1292,11 +1346,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ unsigned int offset;
struct io_tlb_pool *pool;
unsigned int i;
int index;
phys_addr_t tlb_addr;
+ unsigned short pad_slots;
if (!mem || !mem->nslabs) {
dev_warn_ratelimited(dev,
@@ -1313,6 +1368,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
+ offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr);
index = swiotlb_find_slots(dev, orig_addr,
alloc_size + offset, alloc_align_mask, &pool);
if (index == -1) {
@@ -1328,6 +1384,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
+ pad_slots = offset >> IO_TLB_SHIFT;
+ offset &= (IO_TLB_SIZE - 1);
+ index += pad_slots;
+ pool->slots[index].pad_slots = pad_slots;
for (i = 0; i < nr_slots(alloc_size + offset); i++)
pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(pool->start, index) + offset;
@@ -1348,13 +1408,17 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
{
struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
unsigned long flags;
- unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
- int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
- int nslots = nr_slots(mem->slots[index].alloc_size + offset);
- int aindex = index / mem->area_nslabs;
- struct io_tlb_area *area = &mem->areas[aindex];
+ unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
+ int index, nslots, aindex;
+ struct io_tlb_area *area;
int count, i;
+ index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+ index -= mem->slots[index].pad_slots;
+ nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ aindex = index / mem->area_nslabs;
+ area = &mem->areas[aindex];
+
/*
* Return the buffer to the free list by setting the corresponding
* entries to indicate the number of contiguous entries available.
@@ -1377,6 +1441,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
mem->slots[i].list = ++count;
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
}
/*
@@ -1415,6 +1480,7 @@ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr)
dec_used(dev->dma_io_tlb_mem, pool->nslabs);
swiotlb_del_pool(dev, pool);
+ dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs);
return true;
}
@@ -1557,6 +1623,23 @@ phys_addr_t default_swiotlb_limit(void)
}
#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+static unsigned long mem_transient_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->transient_nslabs);
+}
+
+static int io_tlb_transient_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_transient_used(mem);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get,
+ NULL, "%llu\n");
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
static int io_tlb_used_get(void *data, u64 *val)
{
@@ -1593,9 +1676,6 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
const char *dirname)
{
- atomic_long_set(&mem->total_used, 0);
- atomic_long_set(&mem->used_hiwater, 0);
-
mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
if (!mem->nslabs)
return;
@@ -1605,6 +1685,10 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
&fops_io_tlb_used);
debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
&fops_io_tlb_hiwater);
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs,
+ mem, &fops_io_tlb_transient_used);
+#endif
}
static int __init swiotlb_create_default_debugfs(void)
@@ -1631,16 +1715,24 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
+ unsigned int align;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0, &pool);
+ align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
+ index = swiotlb_find_slots(dev, 0, size, align, &pool);
if (index == -1)
return NULL;
tlb_addr = slot_addr(pool->start, index);
+ if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
+ dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
+ &tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr);
+ return NULL;
+ }
return pfn_to_page(PFN_DOWN(tlb_addr));
}
@@ -1706,6 +1798,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
mem->for_alloc = true;
#ifdef CONFIG_SWIOTLB_DYNAMIC
spin_lock_init(&mem->lock);
+ INIT_LIST_HEAD_RCU(&mem->pools);
#endif
add_mem_pool(mem, pool);
diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c
index 92da32275af5..92da32275af5 100644
--- a/kernel/crash_dump.c
+++ b/kernel/elfcorehdr.c
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 88cb3c88aaa5..90843cc38588 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -57,8 +57,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
syscall_enter_audit(regs, syscall);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..6b0a66ed2ae3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq--;
+ epc->nr_freq--;
+ }
if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;
@@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
cpc->exclusive = 1;
@@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || unthrottle))
- return;
-
- raw_spin_lock(&ctx->lock);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -4154,18 +4141,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
if (!event_filter_match(event))
continue;
- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
+ if (!event->attr.freq || !event->attr.sample_freq)
+ event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ continue;
/*
* stop the event and update event->count
@@ -4187,8 +4173,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context has events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (!perf_pmu_ctx_is_active(pmu_ctx))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}
raw_spin_unlock(&ctx->lock);
@@ -6684,14 +6703,6 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
-static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
-{
- /* only the parent has fasync state */
- if (event->parent)
- event = event->parent;
- return &event->fasync;
-}
-
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
@@ -9544,6 +9555,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r
return true;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .event = event,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+
+ return ret;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not have full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
+
+ event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
+ return 0;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static inline int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return 1;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
/*
* Generic event overflow handling, sampling.
*/
@@ -9564,6 +9669,9 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
+ if (event->prog && !bpf_overflow_handler(event, data, regs))
+ return ret;
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -10422,97 +10530,6 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
-#ifdef CONFIG_BPF_SYSCALL
-static void bpf_overflow_handler(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct bpf_perf_event_data_kern ctx = {
- .data = data,
- .event = event,
- };
- struct bpf_prog *prog;
- int ret = 0;
-
- ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
- goto out;
- rcu_read_lock();
- prog = READ_ONCE(event->prog);
- if (prog) {
- perf_prepare_sample(data, event, regs);
- ret = bpf_prog_run(prog, &ctx);
- }
- rcu_read_unlock();
-out:
- __this_cpu_dec(bpf_prog_active);
- if (!ret)
- return;
-
- event->orig_overflow_handler(event, data, regs);
-}
-
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- if (event->overflow_handler_context)
- /* hw breakpoint or kernel counter */
- return -EINVAL;
-
- if (event->prog)
- return -EEXIST;
-
- if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
- return -EINVAL;
-
- if (event->attr.precise_ip &&
- prog->call_get_stack &&
- (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
- event->attr.exclude_callchain_kernel ||
- event->attr.exclude_callchain_user)) {
- /*
- * On perf_event with precise_ip, calling bpf_get_stack()
- * may trigger unwinder warnings and occasional crashes.
- * bpf_get_[stack|stackid] works around this issue by using
- * callchain attached to perf_sample_data. If the
- * perf_event does not full (kernel and user) callchain
- * attached to perf_sample_data, do not allow attaching BPF
- * program that calls bpf_get_[stack|stackid].
- */
- return -EPROTO;
- }
-
- event->prog = prog;
- event->bpf_cookie = bpf_cookie;
- event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
- WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
- return 0;
-}
-
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
- struct bpf_prog *prog = event->prog;
-
- if (!prog)
- return;
-
- WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
- event->prog = NULL;
- bpf_prog_put(prog);
-}
-#else
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- return -EOPNOTSUPP;
-}
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
-}
-#endif
-
/*
* returns true if the event is a tracepoint, or a kprobe/upprobe created
* with perf_event_open()
@@ -11971,13 +11988,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
- if (overflow_handler == bpf_overflow_handler) {
+ if (parent_event->prog) {
struct bpf_prog *prog = parent_event->prog;
bpf_prog_inc(prog);
event->prog = prog;
- event->orig_overflow_handler =
- parent_event->orig_overflow_handler;
}
#endif
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 60ed43d1c29e..4013408ce012 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
+
+ if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
+ handle->event->pending_kill = POLL_IN;
+
irq_work_queue(&handle->event->pending_irq);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c62965..e4834d23e1d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
dec_mm_counter(mm, MM_ANONPAGES);
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 39a5046c2f0b..aebb3e6c96dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
+
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
@@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
i_mmap_unlock_write(mapping);
}
- /*
- * Copy/update hugetlb private vma information.
- */
- if (is_vm_hugetlb_page(tmp))
- hugetlb_dup_vma_private(tmp);
-
- /*
- * Link the vma into the MT. After using __mt_dup(), memory
- * allocation is not necessary here, so it cannot fail.
- */
- vma_iter_bulk_store(&vmi, tmp);
-
- mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
retval = copy_page_range(tmp, mpnt);
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
if (retval) {
mpnt = vma_next(&vmi);
goto loop_out;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9a24574988d2..b2fc2727d654 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
+EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 2531f3496ab6..529adb1f5859 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -108,6 +108,10 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
+# Snapshot for interrupt statistics
+config GENERIC_IRQ_STAT_SNAPSHOT
+ bool
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 1ed2b1739363..75cadbc3c232 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -130,6 +130,22 @@ static bool migrate_one_irq(struct irq_desc *desc)
* CPU.
*/
err = irq_do_set_affinity(d, affinity, false);
+
+ /*
+ * If there are online CPUs in the affinity mask, but they have no
+ * vectors left to make the migration work, try to break the
+ * affinity by migrating to any online CPU.
+ */
+ if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) {
+ pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n",
+ d->irq, cpumask_pr_args(affinity));
+
+ affinity = cpu_online_mask;
+ brokeaff = true;
+
+ err = irq_do_set_affinity(d, affinity, false);
+ }
+
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
@@ -195,10 +211,15 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return;
- if (irqd_is_managed_and_shutdown(data)) {
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ /*
+ * Don't restore suspended interrupts here when a system comes back
+ * from S3. They are reenabled via resume_device_irqs().
+ */
+ if (desc->istate & IRQS_SUSPENDED)
return;
- }
+
+ if (irqd_is_managed_and_shutdown(data))
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
/*
* If the interrupt can only be directed to a single target
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bcc7f21db9ee..ed28059e9849 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -98,6 +98,8 @@ extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
+extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask);
+
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
#else
@@ -258,7 +260,7 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
- __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
@@ -278,6 +280,11 @@ static inline int irq_desc_is_chained(struct irq_desc *desc)
return (desc->action && desc->action == &chained_action);
}
+static inline bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c6b32318ce3..88ac3652fcf2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -134,7 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
}
@@ -186,7 +186,7 @@ static int init_desc(struct irq_desc *desc, int irq, int node,
const struct cpumask *affinity,
struct module *owner)
{
- desc->kstat_irqs = alloc_percpu(unsigned int);
+ desc->kstat_irqs = alloc_percpu(struct irqstat);
if (!desc->kstat_irqs)
return -ENOMEM;
@@ -911,10 +911,7 @@ int irq_set_percpu_devid_partition(unsigned int irq,
{
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc)
- return -EINVAL;
-
- if (desc->percpu_enabled)
+ if (!desc || desc->percpu_enabled)
return -EINVAL;
desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
@@ -922,10 +919,7 @@ int irq_set_percpu_devid_partition(unsigned int irq,
if (!desc->percpu_enabled)
return -ENOMEM;
- if (affinity)
- desc->percpu_affinity = affinity;
- else
- desc->percpu_affinity = cpu_possible_mask;
+ desc->percpu_affinity = affinity ? : cpu_possible_mask;
irq_set_percpu_devid_flags(irq);
return 0;
@@ -968,33 +962,58 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc && desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-static bool irq_is_nmi(struct irq_desc *desc)
+unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
- return desc->istate & IRQS_NMI;
-}
-
-static unsigned int kstat_irqs(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
int cpu;
- if (!desc || !desc->kstat_irqs)
- return 0;
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
return data_race(desc->tot_count);
- for_each_possible_cpu(cpu)
- sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
+ for_each_cpu(cpu, cpumask)
+ sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
return sum;
}
+static unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return kstat_irqs_desc(desc, cpu_possible_mask);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+
+void kstat_snapshot_irqs(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->kstat_irqs)
+ continue;
+ this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+ }
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
+#endif
+
/**
* kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3dd1c871e091..aadc8891cc16 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -909,10 +909,11 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
*/
void irq_dispose_mapping(unsigned int virq)
{
- struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data;
struct irq_domain *domain;
- if (!virq || !irq_data)
+ irq_data = virq ? irq_get_irq_data(virq) : NULL;
+ if (!irq_data)
return;
domain = irq_data->domain;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ad3eaf2ab959..71b0fc2d0aea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -564,7 +564,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc || desc->istate & IRQS_NMI)
+ if (!desc || irq_is_nmi(desc))
return -EINVAL;
/* Complete initialisation of *notify */
@@ -800,10 +800,14 @@ void __enable_irq(struct irq_desc *desc)
irq_settings_set_noprobe(desc);
/*
* Call irq_startup() not irq_enable() here because the
- * interrupt might be marked NOAUTOEN. So irq_startup()
- * needs to be invoked when it gets enabled the first
- * time. If it was already started up, then irq_startup()
- * will invoke irq_enable() under the hood.
+ * interrupt might be marked NOAUTOEN so irq_startup()
+ * needs to be invoked when it gets enabled the first time.
+ * This is also required when __enable_irq() is invoked for
+ * a managed and shutdown interrupt from the S3 resume
+ * path.
+ *
+ * If it was already started up, then irq_startup() will
+ * invoke irq_enable() under the hood.
*/
irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
@@ -898,7 +902,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
return -EINVAL;
/* Don't use NMIs as wake up interrupts please */
- if (desc->istate & IRQS_NMI) {
+ if (irq_is_nmi(desc)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1624,7 +1628,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
unsigned int oldtype;
- if (desc->istate & IRQS_NMI) {
+ if (irq_is_nmi(desc)) {
pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
new->name, irq, desc->irq_data.chip->name);
ret = -EINVAL;
@@ -1643,8 +1647,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT))
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)))
+ goto mismatch;
+
+ if ((old->flags & IRQF_ONESHOT) &&
+ (new->flags & IRQF_COND_ONESHOT))
+ new->flags |= IRQF_ONESHOT;
+ else if ((old->flags ^ new->flags) & IRQF_ONESHOT)
goto mismatch;
/* All handlers must agree on per-cpuness */
@@ -2077,7 +2086,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
unsigned long flags;
const void *devname;
- if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (!desc || WARN_ON(!irq_is_nmi(desc)))
return NULL;
if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
@@ -2543,7 +2552,7 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return;
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (WARN_ON(!irq_is_nmi(desc)))
return;
kfree(__free_percpu_irq(irq, dev_id));
@@ -2679,7 +2688,7 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
return -EINVAL;
/* The line cannot already be NMI */
- if (desc->istate & IRQS_NMI)
+ if (irq_is_nmi(desc))
return -EINVAL;
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
@@ -2740,7 +2749,7 @@ int prepare_percpu_nmi(unsigned int irq)
if (!desc)
return -EINVAL;
- if (WARN(!(desc->istate & IRQS_NMI),
+ if (WARN(!irq_is_nmi(desc),
KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
irq)) {
ret = -EINVAL;
@@ -2782,7 +2791,7 @@ void teardown_percpu_nmi(unsigned int irq)
if (!desc)
return;
- if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ if (WARN_ON(!irq_is_nmi(desc)))
goto out;
irq_nmi_teardown(desc);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 623b8136e9af..5c320c3f10a7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -488,18 +488,15 @@ int show_interrupts(struct seq_file *p, void *v)
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs) {
- for_each_online_cpu(j)
- any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
- }
+ if (desc->kstat_irqs)
+ any_count = kstat_irqs_desc(desc, cpu_online_mask);
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
- seq_printf(p, "%10u ", desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, j) : 0);
+ seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0);
raw_spin_lock_irqsave(&desc->lock, flags);
if (desc->irq_data.chip) {
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 5f2c66860ac6..b07a2d732ffb 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -190,7 +190,7 @@ int irq_inject_interrupt(unsigned int irq)
* - not NMI type
* - activated
*/
- if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data))
+ if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data))
err = -EINVAL;
else
err = check_irq_resend(desc, true);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d9c822bbffb8..3218fa5688b9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -530,6 +530,45 @@ void __init jump_label_init(void)
cpus_read_unlock();
}
+static inline bool static_key_sealed(struct static_key *key)
+{
+ return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline void static_key_seal(struct static_key *key)
+{
+ unsigned long type = key->type & JUMP_TYPE_TRUE;
+ key->type = JUMP_TYPE_LINKED | type;
+}
+
+void jump_label_init_ro(void)
+{
+ struct jump_entry *iter_start = __start___jump_table;
+ struct jump_entry *iter_stop = __stop___jump_table;
+ struct jump_entry *iter;
+
+ if (WARN_ON_ONCE(!static_key_initialized))
+ return;
+
+ cpus_read_lock();
+ jump_label_lock();
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk = jump_entry_key(iter);
+
+ if (!is_kernel_ro_after_init((unsigned long)iterk))
+ continue;
+
+ if (static_key_sealed(iterk))
+ continue;
+
+ static_key_seal(iterk);
+ }
+
+ jump_label_unlock();
+ cpus_read_unlock();
+}
+
#ifdef CONFIG_MODULES
enum jump_label_type jump_label_init_type(struct jump_entry *entry)
@@ -650,6 +689,15 @@ static int jump_label_add_module(struct module *mod)
static_key_set_entries(key, iter);
continue;
}
+
+ /*
+ * If the key was sealed at init, then there's no need to keep a
+ * reference to its module entries - just patch them now and be
+ * done with it.
+ */
+ if (static_key_sealed(key))
+ goto do_poke;
+
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
@@ -675,6 +723,7 @@ static int jump_label_add_module(struct module *mod)
static_key_set_linked(key);
/* Only update if we've changed from our initial state */
+do_poke:
if (jump_label_type(iter) != jump_label_init_type(iter))
__jump_label_update(key, iter, iter_stop, true);
}
@@ -699,6 +748,10 @@ static void jump_label_del_module(struct module *mod)
if (within_module((unsigned long)key, mod))
continue;
+ /* No @jlm allocated because key was sealed at init. */
+ if (static_key_sealed(key))
+ continue;
+
/* No memory during module load */
if (WARN_ON(!static_key_linked(key)))
continue;
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index b4cac76ea5e9..8a689b4ff4f9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -89,7 +89,6 @@ static struct test_item test_items[] = {
ITEM_DATA(kallsyms_test_var_data_static),
ITEM_DATA(kallsyms_test_var_bss),
ITEM_DATA(kallsyms_test_var_data),
- ITEM_DATA(vmap_area_list),
#endif
};
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 015586217875..0c17b4c83e1c 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -304,6 +304,7 @@ static long test_array[3 * PAGE_SIZE / sizeof(long)];
static struct {
long val[8];
} test_struct;
+static long __data_racy test_data_racy;
static DEFINE_SEQLOCK(test_seqlock);
static DEFINE_SPINLOCK(test_spinlock);
static DEFINE_MUTEX(test_mutex);
@@ -358,6 +359,8 @@ static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; }
+
static noinline void test_kernel_assert_writer(void)
{
ASSERT_EXCLUSIVE_WRITER(test_var);
@@ -1009,6 +1012,19 @@ static void test_data_race(struct kunit *test)
KUNIT_EXPECT_FALSE(test, match_never);
}
+/* Test the __data_racy type qualifier. */
+__no_kcsan
+static void test_data_racy_qualifier(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
@@ -1424,6 +1440,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
KCSAN_KUNIT_CASE(test_zero_size_access),
KCSAN_KUNIT_CASE(test_data_race),
+ KCSAN_KUNIT_CASE(test_data_racy_qualifier),
KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
KCSAN_KUNIT_CASE(test_assert_exclusive_access),
KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8f35a5a42af8..bab542fc1463 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Verify we have a valid entry point */
if ((entry < phys_to_boot_phys(crashk_res.start)) ||
(entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
+#endif
/* Allocate and initialize a controlling structure */
image = do_kimage_alloc_init();
@@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
image->nr_segments = nr_segments;
memcpy(image->segment, segments, nr_segments * sizeof(*segments));
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = sanity_check_segment_list(image);
if (ret)
@@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (!kexec_trylock())
return -EBUSY;
+#ifdef CONFIG_CRASH_DUMP
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- } else {
+ } else
+#endif
dest_image = &kexec_image;
- }
if (nr_segments == 0) {
/* Uninstall image */
@@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
kimage_free(image);
out_unlock:
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d08fc7b5db97..0e96f6b24344 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -54,30 +54,6 @@ bool kexec_in_progress = false;
bool kexec_file_dbg_print;
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in make_task_dead() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-int kexec_crash_loaded(void)
-{
- return !!kexec_crash_image;
-}
-EXPORT_SYMBOL_GPL(kexec_crash_loaded);
-
/*
* When kexec transitions to the new kernel there is a one-to-one
* mapping between physical and virtual addresses. On processors
@@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image)
if (total_pages > nr_pages / 2)
return -EINVAL;
+#ifdef CONFIG_CRASH_DUMP
/*
* Verify we have good destination addresses. Normally
* the caller is responsible for making certain we don't
@@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image)
return -EADDRNOTAVAIL;
}
}
+#endif
return 0;
}
@@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
return pages;
}
+#ifdef CONFIG_CRASH_DUMP
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
unsigned int order)
{
@@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
return pages;
}
+#endif
struct page *kimage_alloc_control_pages(struct kimage *image,
@@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
case KEXEC_TYPE_DEFAULT:
pages = kimage_alloc_normal_control_pages(image, order);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
pages = kimage_alloc_crash_control_pages(image, order);
break;
+#endif
}
return pages;
}
-int kimage_crash_copy_vmcoreinfo(struct kimage *image)
-{
- struct page *vmcoreinfo_page;
- void *safecopy;
-
- if (image->type != KEXEC_TYPE_CRASH)
- return 0;
-
- /*
- * For kdump, allocate one vmcoreinfo safe copy from the
- * crash memory. as we have arch_kexec_protect_crashkres()
- * after kexec syscall, we naturally protect it from write
- * (even read) access under kernel direct mapping. But on
- * the other hand, we still need to operate it when crash
- * happens to generate vmcoreinfo note, hereby we rely on
- * vmap for this purpose.
- */
- vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
- if (!vmcoreinfo_page) {
- pr_warn("Could not allocate vmcoreinfo buffer\n");
- return -ENOMEM;
- }
- safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
- if (!safecopy) {
- pr_warn("Could not vmap vmcoreinfo buffer\n");
- return -ENOMEM;
- }
-
- image->vmcoreinfo_data_copy = safecopy;
- crash_update_vmcoreinfo_safecopy(safecopy);
-
- return 0;
-}
-
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
if (*image->entry != 0)
@@ -603,10 +551,12 @@ void kimage_free(struct kimage *image)
if (!image)
return;
+#ifdef CONFIG_CRASH_DUMP
if (image->vmcoreinfo_data_copy) {
crash_update_vmcoreinfo_safecopy(NULL);
vunmap(image->vmcoreinfo_data_copy);
}
+#endif
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
@@ -800,22 +750,24 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
kunmap_local(ptr);
if (result) {
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -824,6 +776,7 @@ out:
return result;
}
+#ifdef CONFIG_CRASH_DUMP
static int kimage_load_crash_segment(struct kimage *image,
struct kexec_segment *segment)
{
@@ -866,11 +819,18 @@ static int kimage_load_crash_segment(struct kimage *image,
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
kexec_flush_icache_page(page);
kunmap_local(ptr);
arch_kexec_pre_free_pages(page_address(page), 1);
@@ -878,12 +838,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -EFAULT;
goto out;
}
- ubytes -= uchunk;
maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -891,6 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
out:
return result;
}
+#endif
int kimage_load_segment(struct kimage *image,
struct kexec_segment *segment)
@@ -901,9 +857,11 @@ int kimage_load_segment(struct kimage *image,
case KEXEC_TYPE_DEFAULT:
result = kimage_load_normal_segment(image, segment);
break;
+#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
result = kimage_load_crash_segment(image, segment);
break;
+#endif
}
return result;
@@ -1028,186 +986,6 @@ bool kexec_load_permitted(int kexec_image_type)
}
/*
- * No panic_cpu check version of crash_kexec(). This function is called
- * only when panic_cpu holds the current CPU number; this is the only CPU
- * which processes crash_kexec routines.
- */
-void __noclone __crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_lock here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (kexec_trylock()) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- kexec_unlock();
- }
-}
-STACK_FRAME_NON_STANDARD(__crash_kexec);
-
-__bpf_kfunc void crash_kexec(struct pt_regs *regs)
-{
- int old_cpu, this_cpu;
-
- /*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
- */
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
- /* This is the 1st CPU which comes here, so go ahead. */
- __crash_kexec(regs);
-
- /*
- * Reset panic_cpu to allow another panic()/crash_kexec()
- * call.
- */
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
- }
-}
-
-static inline resource_size_t crash_resource_size(const struct resource *res)
-{
- return !res->end ? 0 : resource_size(res);
-}
-
-ssize_t crash_get_memory_size(void)
-{
- ssize_t size = 0;
-
- if (!kexec_trylock())
- return -EBUSY;
-
- size += crash_resource_size(&crashk_res);
- size += crash_resource_size(&crashk_low_res);
-
- kexec_unlock();
- return size;
-}
-
-static int __crash_shrink_memory(struct resource *old_res,
- unsigned long new_size)
-{
- struct resource *ram_res;
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res)
- return -ENOMEM;
-
- ram_res->start = old_res->start + new_size;
- ram_res->end = old_res->end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
- ram_res->name = "System RAM";
-
- if (!new_size) {
- release_resource(old_res);
- old_res->start = 0;
- old_res->end = 0;
- } else {
- crashk_res.end = ram_res->start - 1;
- }
-
- crash_free_reserved_phys_range(ram_res->start, ram_res->end);
- insert_resource(&iomem_resource, ram_res);
-
- return 0;
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long old_size, low_size;
-
- if (!kexec_trylock())
- return -EBUSY;
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
-
- low_size = crash_resource_size(&crashk_low_res);
- old_size = crash_resource_size(&crashk_res) + low_size;
- new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- /*
- * (low_size > new_size) implies that low_size is greater than zero.
- * This also means that if low_size is zero, the else branch is taken.
- *
- * If low_size is greater than 0, (low_size > new_size) indicates that
- * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
- * needs to be shrunken.
- */
- if (low_size > new_size) {
- ret = __crash_shrink_memory(&crashk_res, 0);
- if (ret)
- goto unlock;
-
- ret = __crash_shrink_memory(&crashk_low_res, new_size);
- } else {
- ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
- }
-
- /* Swap crashk_res and crashk_low_res if needed */
- if (!crashk_res.end && crashk_low_res.end) {
- crashk_res.start = crashk_low_res.start;
- crashk_res.end = crashk_low_res.end;
- release_resource(&crashk_low_res);
- crashk_low_res.start = 0;
- crashk_low_res.end = 0;
- insert_resource(&iomem_resource, &crashk_res);
- }
-
-unlock:
- kexec_unlock();
- return ret;
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.common.pr_pid = current->pid;
- elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bef2f6f2571b..2d1db05fbf04 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
cmdline_ptr, cmdline_len, flags);
@@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (!kexec_trylock())
return -EBUSY;
+#ifdef CONFIG_CRASH_DUMP
if (image_type == KEXEC_TYPE_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
arch_kexec_unprotect_crashkres();
- } else {
+ } else
+#endif
dest_image = &kexec_image;
- }
if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+#ifdef CONFIG_CRASH_DUMP
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
+#endif
kexec_unlock();
kimage_free(image);
@@ -535,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
phys_addr_t mstart, mend;
struct resource res = { };
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+#endif
/*
* Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
@@ -595,12 +602,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
static int kexec_walk_resources(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
+#ifdef CONFIG_CRASH_DUMP
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
- else if (kbuf->top_down)
+#endif
+ if (kbuf->top_down)
return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 74da1409cd14..2595defe8c0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -4,6 +4,8 @@
#include <linux/kexec.h>
+struct kexec_segment;
+
struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9d9095e81792..65adc815fc6e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1567,10 +1567,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
jump_label_lock();
preempt_disable();
- /* Ensure it is not in reserved area nor out of text */
- if (!(core_kernel_text((unsigned long) p->addr) ||
- is_module_text_address((unsigned long) p->addr)) ||
- in_gate_area_no_mm((unsigned long) p->addr) ||
+ /* Ensure the address is in a text area, and find the module if one exists. */
+ *probed_mod = NULL;
+ if (!core_kernel_text((unsigned long) p->addr)) {
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (!(*probed_mod)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ /* Ensure it is not in reserved area. */
+ if (in_gate_area_no_mm((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
@@ -1580,8 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
- /* Check if 'p' is probing a module. */
- *probed_mod = __module_text_address((unsigned long) p->addr);
+ /* Get module refcount and reject __init functions for loaded modules. */
if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1d4bc493b2f4..495b69a71a5d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum));
}
KERNEL_ATTR_RO(uevent_seqnum);
@@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(kexec_loaded);
+#ifdef CONFIG_CRASH_DUMP
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -152,9 +153,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(kexec_crash_size);
+#endif /* CONFIG_CRASH_DUMP */
#endif /* CONFIG_KEXEC_CORE */
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -177,7 +179,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size);
#endif
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -262,10 +264,12 @@ static struct attribute * kernel_attrs[] = {
#endif
#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
#endif
-#ifdef CONFIG_CRASH_CORE
+#endif
+#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
#ifdef CONFIG_CRASH_HOTPLUG
&crash_elfcorehdr_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5e40830c1f2..f7be976ff88a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -315,6 +315,7 @@ void __noreturn kthread_exit(long result)
kthread->result = result;
do_exit(0);
}
+EXPORT_SYMBOL(kthread_exit);
/**
* kthread_complete_and_exit - Exit the current kthread.
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index a6016b91803d..d2345e9c0190 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -53,8 +53,8 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#else /* CONFIG_LOCK_EVENT_COUNTS */
#define lockevent_inc(ev)
-#define lockevent_add(ev, c)
-#define lockevent_cond_inc(ev, c)
+#define lockevent_add(ev, c) do { (void)(c); } while (0)
+#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb3..1df5fef8a656 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
*/
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
- u32 old, new, val = atomic_read(&lock->val);
+ u32 old, new;
- for (;;) {
- new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
/*
* We can use relaxed semantics since the caller ensures that
* the MCS node is properly initialized before updating the
* tail.
*/
- old = atomic_cmpxchg_relaxed(&lock->val, val, new);
- if (old == val)
- break;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
- val = old;
- }
return old;
}
#endif /* _Q_PENDING_BITS == 8 */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index ae2b12f68b90..f5a36e67b593 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -86,9 +86,10 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
*/
for (;;) {
int val = atomic_read(&lock->val);
+ u8 old = 0;
if (!(val & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) {
lockevent_inc(pv_lock_stealing);
return true;
}
@@ -116,11 +117,12 @@ static __always_inline void set_pending(struct qspinlock *lock)
* barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
* lock just to be sure that it will get it.
*/
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
+ u16 old = _Q_PENDING_VAL;
+
return !READ_ONCE(lock->locked) &&
- (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
- _Q_LOCKED_VAL) == _Q_PENDING_VAL);
+ try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -128,27 +130,21 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
- int val = atomic_read(&lock->val);
-
- for (;;) {
- int old, new;
-
- if (val & _Q_LOCKED_MASK)
- break;
+ int old, new;
+ old = atomic_read(&lock->val);
+ do {
+ if (old & _Q_LOCKED_MASK)
+ return false;
/*
* Try to clear pending bit & set locked bit
*/
- old = val;
- new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg_acquire(&lock->val, old, new);
+ new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new));
- if (val == old)
- return 1;
- }
- return 0;
+ return true;
}
#endif /* _Q_PENDING_BITS == 8 */
@@ -216,8 +212,9 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *old = NULL;
hopcnt++;
- if (!cmpxchg(&he->lock, NULL, lock)) {
+ if (try_cmpxchg(&he->lock, &old, lock)) {
WRITE_ONCE(he->node, node);
lockevent_pv_hop(hopcnt);
return &he->lock;
@@ -294,7 +291,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- bool __maybe_unused wait_early;
+ bool wait_early;
int loop;
for (;;) {
@@ -360,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
-
+ enum vcpu_state old = vcpu_halted;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
@@ -377,8 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* subsequent writes.
*/
smp_mb__before_atomic();
- if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
- != vcpu_halted)
+ if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed))
return;
/*
@@ -546,15 +542,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#ifndef __pv_queued_spin_unlock
__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- u8 locked;
+ u8 locked = _Q_LOCKED_VAL;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
+ if (try_cmpxchg_release(&lock->locked, &locked, 0))
return;
__pv_queued_spin_unlock_slowpath(lock, locked);
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 0ea1b2970a23..f3e0329337f6 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,6 +236,10 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
+config MODULE_SIG_SHA1
+ bool "Sign modules with SHA-1"
+ select CRYPTO_SHA1
+
config MODULE_SIG_SHA256
bool "Sign modules with SHA-256"
select CRYPTO_SHA256
@@ -265,6 +269,7 @@ endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
@@ -362,8 +367,7 @@ config MODPROBE_PATH
userspace can still load modules explicitly).
config TRIM_UNUSED_KSYMS
- bool "Trim unused exported kernel symbols" if EXPERT
- depends on !COMPILE_TEST
+ bool "Trim unused exported kernel symbols"
help
The kernel and some modules make many symbols available for
other modules to use via EXPORT_SYMBOL() and variants. Depending
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index c8b7b4dcf782..2ebece8a789f 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -322,9 +322,9 @@ static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *
}
#endif /* CONFIG_MODULES_TREE_LOOKUP */
-void module_enable_ro(const struct module *mod, bool after_init);
-void module_enable_nx(const struct module *mod);
-void module_enable_x(const struct module *mod);
+int module_enable_rodata_ro(const struct module *mod, bool after_init);
+int module_enable_data_nx(const struct module *mod);
+int module_enable_text_rox(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
char *secstrings, struct module *mod);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 36681911c05a..e1e8a7a9d6c1 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w)
}
}
+void flush_module_init_free_work(void)
+{
+ flush_work(&init_free_wq);
+}
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "module."
/* Default value for module->async_probe_requested */
@@ -2571,7 +2576,9 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
- module_enable_ro(mod, true);
+ ret = module_enable_rodata_ro(mod, true);
+ if (ret)
+ goto fail_mutex_unlock;
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
for_class_mod_mem_type(type, init) {
@@ -2593,8 +2600,8 @@ static noinline int do_init_module(struct module *mod)
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
- * be cleaned up needs to sync with the queued work - ie
- * rcu_barrier()
+ * be cleaned up needs to sync with the queued work by invoking
+ * flush_module_init_free_work().
*/
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
@@ -2609,6 +2616,8 @@ static noinline int do_init_module(struct module *mod)
return 0;
+fail_mutex_unlock:
+ mutex_unlock(&module_mutex);
fail_free_freeinit:
kfree(freeinit);
fail:
@@ -2736,9 +2745,15 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_bug_finalize(info->hdr, info->sechdrs, mod);
module_cfi_finalize(info->hdr, info->sechdrs, mod);
- module_enable_ro(mod, false);
- module_enable_nx(mod);
- module_enable_x(mod);
+ err = module_enable_rodata_ro(mod, false);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_data_nx(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_text_rox(mod);
+ if (err)
+ goto out_strict_rwx;
/*
* Mark state as coming so strong_try_module_get() ignores us,
@@ -2749,6 +2764,8 @@ static int complete_formation(struct module *mod, struct load_info *info)
return 0;
+out_strict_rwx:
+ module_bug_cleanup(mod);
out:
mutex_unlock(&module_mutex);
return err;
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index a2b656b4e3d2..c45caa4690e5 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -11,13 +11,16 @@
#include <linux/set_memory.h>
#include "internal.h"
-static void module_set_memory(const struct module *mod, enum mod_mem_type type,
- int (*set_memory)(unsigned long start, int num_pages))
+static int module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
{
const struct module_memory *mod_mem = &mod->mem[type];
+ if (!mod_mem->base)
+ return 0;
+
set_vm_flush_reset_perms(mod_mem->base);
- set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+ return set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
}
/*
@@ -26,37 +29,53 @@ static void module_set_memory(const struct module *mod, enum mod_mem_type type,
* CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
* are strict.
*/
-void module_enable_x(const struct module *mod)
+int module_enable_text_rox(const struct module *mod)
{
- for_class_mod_mem_type(type, text)
- module_set_memory(mod, type, set_memory_x);
+ for_class_mod_mem_type(type, text) {
+ int ret;
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ ret = module_set_memory(mod, type, set_memory_rox);
+ else
+ ret = module_set_memory(mod, type, set_memory_x);
+ if (ret)
+ return ret;
+ }
+ return 0;
}
-void module_enable_ro(const struct module *mod, bool after_init)
+int module_enable_rodata_ro(const struct module *mod, bool after_init)
{
- if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return;
-#ifdef CONFIG_STRICT_MODULE_RWX
- if (!rodata_enabled)
- return;
-#endif
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
- module_set_memory(mod, MOD_TEXT, set_memory_ro);
- module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
- module_set_memory(mod, MOD_RODATA, set_memory_ro);
- module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ ret = module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+ ret = module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
if (after_init)
- module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+
+ return 0;
}
-void module_enable_nx(const struct module *mod)
+int module_enable_data_nx(const struct module *mod)
{
if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return;
+ return 0;
- for_class_mod_mem_type(type, data)
- module_set_memory(mod, type, set_memory_nx);
+ for_class_mod_mem_type(type, data) {
+ int ret = module_set_memory(mod, type, set_memory_nx);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
}
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
diff --git a/kernel/padata.c b/kernel/padata.c
index 179fb1518070..53f4bc912712 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -106,7 +106,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data,
{
int i;
- spin_lock(&padata_works_lock);
+ spin_lock_bh(&padata_works_lock);
/* Start at 1 because the current task participates in the job. */
for (i = 1; i < nworks; ++i) {
struct padata_work *pw = padata_work_alloc();
@@ -116,7 +116,7 @@ static int __init padata_work_alloc_mt(int nworks, void *data,
padata_work_init(pw, padata_mt_helper, data, 0);
list_add(&pw->pw_list, head);
}
- spin_unlock(&padata_works_lock);
+ spin_unlock_bh(&padata_works_lock);
return i;
}
@@ -134,12 +134,12 @@ static void __init padata_works_free(struct list_head *works)
if (list_empty(works))
return;
- spin_lock(&padata_works_lock);
+ spin_lock_bh(&padata_works_lock);
list_for_each_entry_safe(cur, next, works, pw_list) {
list_del(&cur->pw_list);
padata_work_free(cur);
}
- spin_unlock(&padata_works_lock);
+ spin_unlock_bh(&padata_works_lock);
}
static void padata_parallel_worker(struct work_struct *parallel_work)
@@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
- int nworks;
+ int nworks, nid;
+ static atomic_t last_used_nid __initdata;
if (job->size == 0)
return;
@@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
ps.chunk_size = roundup(ps.chunk_size, job->align);
list_for_each_entry(pw, &works, pw_list)
- queue_work(system_unbound_wq, &pw->pw_work);
+ if (job->numa_aware) {
+ int old_node = atomic_read(&last_used_nid);
+
+ do {
+ nid = next_node_in(old_node, node_states[N_CPU]);
+ } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+ queue_work_node(nid, system_unbound_wq, &pw->pw_work);
+ } else {
+ queue_work(system_unbound_wq, &pw->pw_work);
+ }
/* Use the current thread, which saves starting a workqueue worker. */
padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
diff --git a/kernel/panic.c b/kernel/panic.c
index f22d8f33ea14..747c3f3d289a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
#define PANIC_PRINT_ALL_CPU_BT 0x00000040
+#define PANIC_PRINT_BLOCKED_TASKS 0x00000080
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -227,6 +228,9 @@ static void panic_print_sys_info(bool console_flush)
if (panic_print & PANIC_PRINT_FTRACE_INFO)
ftrace_dump(DUMP_ALL);
+
+ if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
+ show_state_filter(TASK_UNINTERRUPTIBLE);
}
void check_panic_on_warn(const char *origin)
@@ -674,8 +678,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
raw_smp_processor_id(), current->pid, caller);
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
if (args)
vprintk(args->fmt, args->args);
+#pragma GCC diagnostic pop
print_modules();
diff --git a/kernel/pid.c b/kernel/pid.c
index 99a0c5eb24b8..da76ed1873f7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -62,17 +62,13 @@ struct pid init_struct_pid = {
int pid_max = PID_MAX_DEFAULT;
-#define RESERVED_PIDS 300
-
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#ifdef CONFIG_FS_PID
/*
* Pseudo filesystems start inode numbering after one. We use Reserved
* PIDs as a natural offset.
*/
static u64 pidfs_ino = RESERVED_PIDS;
-#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -280,10 +276,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
-#ifdef CONFIG_FS_PID
pid->stashed = NULL;
pid->ino = ++pidfs_ino;
-#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4b31629c5be4..afce8130d8b9 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -39,9 +39,9 @@ config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on SWAP && ARCH_HIBERNATION_POSSIBLE
select HIBERNATE_CALLBACKS
- select LZO_COMPRESS
- select LZO_DECOMPRESS
select CRC32
+ select CRYPTO
+ select CRYPTO_LZO
help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -92,6 +92,28 @@ config HIBERNATION_SNAPSHOT_DEV
If in doubt, say Y.
+choice
+ prompt "Default compressor"
+ default HIBERNATION_COMP_LZO
+ depends on HIBERNATION
+
+config HIBERNATION_COMP_LZO
+ bool "lzo"
+ depends on CRYPTO_LZO
+
+config HIBERNATION_COMP_LZ4
+ bool "lz4"
+ depends on CRYPTO_LZ4
+
+endchoice
+
+config HIBERNATION_DEF_COMP
+ string
+ default "lzo" if HIBERNATION_COMP_LZO
+ default "lz4" if HIBERNATION_COMP_LZ4
+ help
+ Default compressor to be used for hibernation.
+
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7b44f5b89fa1..927cc55ba0b3 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -23,6 +23,12 @@
*/
static DEFINE_MUTEX(em_pd_mutex);
+static void em_cpufreq_update_efficiencies(struct device *dev,
+ struct em_perf_state *table);
+static void em_check_capacity_update(void);
+static void em_update_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
+
static bool _is_cpu_device(struct device *dev)
{
return (dev->bus == &cpu_subsys);
@@ -31,19 +37,65 @@ static bool _is_cpu_device(struct device *dev)
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
+struct em_dbg_info {
+ struct em_perf_domain *pd;
+ int ps_id;
+};
+
+#define DEFINE_EM_DBG_SHOW(name, fname) \
+static int em_debug_##fname##_show(struct seq_file *s, void *unused) \
+{ \
+ struct em_dbg_info *em_dbg = s->private; \
+ struct em_perf_state *table; \
+ unsigned long val; \
+ \
+ rcu_read_lock(); \
+ table = em_perf_state_from_pd(em_dbg->pd); \
+ val = table[em_dbg->ps_id].name; \
+ rcu_read_unlock(); \
+ \
+ seq_printf(s, "%lu\n", val); \
+ return 0; \
+} \
+DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)
+
+DEFINE_EM_DBG_SHOW(frequency, frequency);
+DEFINE_EM_DBG_SHOW(power, power);
+DEFINE_EM_DBG_SHOW(cost, cost);
+DEFINE_EM_DBG_SHOW(performance, performance);
+DEFINE_EM_DBG_SHOW(flags, inefficiency);
+
+static void em_debug_create_ps(struct em_perf_domain *em_pd,
+ struct em_dbg_info *em_dbg, int i,
+ struct dentry *pd)
{
+ struct em_perf_state *table;
+ unsigned long freq;
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
+ em_dbg[i].pd = em_pd;
+ em_dbg[i].ps_id = i;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(em_pd);
+ freq = table[i].frequency;
+ rcu_read_unlock();
+
+ snprintf(name, sizeof(name), "ps:%lu", freq);
/* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
- debugfs_create_ulong("power", 0444, d, &ps->power);
- debugfs_create_ulong("cost", 0444, d, &ps->cost);
- debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
+ debugfs_create_file("frequency", 0444, d, &em_dbg[i],
+ &em_debug_frequency_fops);
+ debugfs_create_file("power", 0444, d, &em_dbg[i],
+ &em_debug_power_fops);
+ debugfs_create_file("cost", 0444, d, &em_dbg[i],
+ &em_debug_cost_fops);
+ debugfs_create_file("performance", 0444, d, &em_dbg[i],
+ &em_debug_performance_fops);
+ debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
+ &em_debug_inefficiency_fops);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -66,6 +118,7 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
+ struct em_dbg_info *em_dbg;
struct dentry *d;
int i;
@@ -79,9 +132,14 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("flags", 0444, d, dev->em_pd,
&em_debug_flags_fops);
+ em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
+ sizeof(*em_dbg), GFP_KERNEL);
+ if (!em_dbg)
+ return;
+
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
- em_debug_create_ps(&dev->em_pd->table[i], d);
+ em_debug_create_ps(dev->em_pd, em_dbg, i, d);
}
@@ -103,18 +161,192 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
+static void em_destroy_table_rcu(struct rcu_head *rp)
+{
+ struct em_perf_table __rcu *table;
+
+ table = container_of(rp, struct em_perf_table, rcu);
+ kfree(table);
+}
+
+static void em_release_table_kref(struct kref *kref)
+{
+ struct em_perf_table __rcu *table;
+
+ /* It was the last owner of this table so we can free */
+ table = container_of(kref, struct em_perf_table, kref);
+
+ call_rcu(&table->rcu, em_destroy_table_rcu);
+}
+
+/**
+ * em_table_free() - Handles safe free of the EM table when needed
+ * @table : EM table which is going to be freed
+ *
+ * No return values.
+ */
+void em_table_free(struct em_perf_table __rcu *table)
+{
+ kref_put(&table->kref, em_release_table_kref);
+}
+
+/**
+ * em_table_alloc() - Allocate a new EM table
+ * @pd : EM performance domain for which this must be done
+ *
+ * Allocate a new EM table and initialize its kref to indicate that it
+ * has a user.
+ * Returns allocated table or NULL.
+ */
+struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+{
+ struct em_perf_table __rcu *table;
+ int table_size;
+
+ table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+
+ table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ kref_init(&table->kref);
+
+ return table;
+}
+
+static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table, int nr_states)
+{
+ u64 fmax, max_cap;
+ int i, cpu;
+
+ /* This is needed only for CPUs and EAS; skip other devices */
+ if (!_is_cpu_device(dev))
+ return;
+
+ cpu = cpumask_first(em_span_cpus(pd));
+
+ /*
+ * Calculate the performance value for each frequency with
+ * linear relationship. The final CPU capacity might not be ready at
+ * boot time, but the EM will be updated a bit later with correct one.
+ */
+ fmax = (u64) table[nr_states - 1].frequency;
+ max_cap = (u64) arch_scale_cpu_capacity(cpu);
+ for (i = 0; i < nr_states; i++)
+ table[i].performance = div64_u64(max_cap * table[i].frequency,
+ fmax);
+}
+
+static int em_compute_costs(struct device *dev, struct em_perf_state *table,
+ struct em_data_callback *cb, int nr_states,
+ unsigned long flags)
+{
+ unsigned long prev_cost = ULONG_MAX;
+ int i, ret;
+
+ /* Compute the cost of each performance state. */
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ return -EINVAL;
+ }
+ } else {
+ /* increase resolution of 'cost' precision */
+ power_res = table[i].power * 10;
+ cost = power_res / table[i].performance;
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * em_dev_compute_costs() - Calculate cost values for new runtime EM table
+ * @dev : Device for which the EM table is to be updated
+ * @table : The new EM table that is going to get the costs calculated
+ * @nr_states : Number of performance states
+ *
+ * Calculate the em_perf_state::cost values for new runtime EM table. The
+ * values are used for EAS during task placement. It also calculates and sets
+ * the efficiency flag for each performance state. When the function finishes
+ * successfully, the EM table is ready to be updated and used by EAS.
+ *
+ * Return 0 on success or a proper error in case of failure.
+ */
+int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
+ int nr_states)
+{
+ return em_compute_costs(dev, table, NULL, nr_states, 0);
+}
+
+/**
+ * em_dev_update_perf_domain() - Update runtime EM table for a device
+ * @dev : Device for which the EM is to be updated
+ * @new_table : The new EM table that is going to be used from now
+ *
+ * Update EM runtime modifiable table for the @dev using the provided @new_table.
+ *
+ * This function uses a mutex to serialize writers, so it must not be called
+ * from a non-sleeping context.
+ *
+ * Return 0 on success or an error code on failure.
+ */
+int em_dev_update_perf_domain(struct device *dev,
+ struct em_perf_table __rcu *new_table)
+{
+ struct em_perf_table __rcu *old_table;
+ struct em_perf_domain *pd;
+
+ if (!dev)
+ return -EINVAL;
+
+ /* Serialize update/unregister or concurrent updates */
+ mutex_lock(&em_pd_mutex);
+
+ if (!dev->em_pd) {
+ mutex_unlock(&em_pd_mutex);
+ return -EINVAL;
+ }
+ pd = dev->em_pd;
+
+ kref_get(&new_table->kref);
+
+ old_table = pd->em_table;
+ rcu_assign_pointer(pd->em_table, new_table);
+
+ em_cpufreq_update_efficiencies(dev, new_table->state);
+
+ em_table_free(old_table);
+
+ mutex_unlock(&em_pd_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
+
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb,
+ struct em_perf_state *table,
+ struct em_data_callback *cb,
unsigned long flags)
{
- unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
- struct em_perf_state *table;
+ unsigned long power, freq, prev_freq = 0;
+ int nr_states = pd->nr_perf_states;
int i, ret;
- u64 fmax;
-
- table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
- if (!table)
- return -ENOMEM;
/* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
@@ -127,7 +359,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
- goto free_ps_table;
+ return -EINVAL;
}
/*
@@ -137,7 +369,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (freq <= prev_freq) {
dev_err(dev, "EM: non-increasing freq: %lu\n",
freq);
- goto free_ps_table;
+ return -EINVAL;
}
/*
@@ -147,55 +379,27 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
power);
- goto free_ps_table;
+ return -EINVAL;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
}
- /* Compute the cost of each performance state. */
- fmax = (u64) table[nr_states - 1].frequency;
- for (i = nr_states - 1; i >= 0; i--) {
- unsigned long power_res, cost;
+ em_init_performance(dev, pd, table, nr_states);
- if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
- ret = cb->get_cost(dev, table[i].frequency, &cost);
- if (ret || !cost || cost > EM_MAX_POWER) {
- dev_err(dev, "EM: invalid cost %lu %d\n",
- cost, ret);
- goto free_ps_table;
- }
- } else {
- power_res = table[i].power;
- cost = div64_u64(fmax * power_res, table[i].frequency);
- }
-
- table[i].cost = cost;
-
- if (table[i].cost >= prev_cost) {
- table[i].flags = EM_PERF_STATE_INEFFICIENT;
- dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
- table[i].frequency);
- } else {
- prev_cost = table[i].cost;
- }
- }
-
- pd->table = table;
- pd->nr_perf_states = nr_states;
+ ret = em_compute_costs(dev, table, cb, nr_states, flags);
+ if (ret)
+ return -EINVAL;
return 0;
-
-free_ps_table:
- kfree(table);
- return -EINVAL;
}
static int em_create_pd(struct device *dev, int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
unsigned long flags)
{
+ struct em_perf_table __rcu *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -220,11 +424,17 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
- if (ret) {
- kfree(pd);
- return ret;
- }
+ pd->nr_perf_states = nr_states;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ goto free_pd;
+
+ ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
+ if (ret)
+ goto free_pd_table;
+
+ rcu_assign_pointer(pd->em_table, em_table);
if (_is_cpu_device(dev))
for_each_cpu(cpu, cpus) {
@@ -235,26 +445,37 @@ static int em_create_pd(struct device *dev, int nr_states,
dev->em_pd = pd;
return 0;
+
+free_pd_table:
+ kfree(em_table);
+free_pd:
+ kfree(pd);
+ return -EINVAL;
}
-static void em_cpufreq_update_efficiencies(struct device *dev)
+static void
+em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
{
struct em_perf_domain *pd = dev->em_pd;
- struct em_perf_state *table;
struct cpufreq_policy *policy;
int found = 0;
- int i;
+ int i, cpu;
- if (!_is_cpu_device(dev) || !pd)
+ if (!_is_cpu_device(dev))
return;
- policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
- if (!policy) {
- dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ /* Try to get a CPU which is active and in this PD */
+ cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
return;
}
- table = pd->table;
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
+ return;
+ }
for (i = 0; i < pd->nr_perf_states; i++) {
if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
@@ -391,19 +612,34 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
else if (cb->get_cost)
flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+ /*
+ * EM only supports uW (exception is artificial EM).
+ * Therefore, check and force the drivers to provide
+ * power in uW.
+ */
+ if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
+ dev_err(dev, "EM: only supports uW power values\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
dev->em_pd->flags |= flags;
- em_cpufreq_update_efficiencies(dev);
+ em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
unlock:
mutex_unlock(&em_pd_mutex);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
@@ -430,9 +666,193 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- kfree(dev->em_pd->table);
+ em_table_free(dev->em_pd->em_table);
+
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
+
+static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
+{
+ struct em_perf_table __rcu *em_table;
+ struct em_perf_state *ps, *new_ps;
+ int ps_size;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ return NULL; /* allocation failed; callers in this file warn and bail out */
+
+ new_ps = em_table->state;
+
+ rcu_read_lock();
+ ps = em_perf_state_from_pd(pd);
+ /*
+ * Initialize data based on old table: copy all pd->nr_perf_states
+ * entries from the state array returned by em_perf_state_from_pd()
+ * into the freshly allocated table. The copy is done while still
+ * inside the RCU read-side section opened above.
+ */
+ ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+ memcpy(new_ps, ps, ps_size);
+
+ rcu_read_unlock();
+
+ return em_table; /* callers in this file release it with em_table_free() */
+}
+
+static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_table __rcu *em_table)
+{
+ int ret;
+
+ ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states,
+ pd->flags);
+ if (ret)
+ goto free_em_table;
+
+ ret = em_dev_update_perf_domain(dev, em_table);
+ if (ret)
+ goto free_em_table;
+
+ /*
+ * The success path deliberately falls through to the label below, so
+ * em_table_free() is called on @em_table whether or not the update
+ * succeeded, and @ret (0 or the error) is returned either way.
+ *
+ * This is a one-time update, so give up the ownership in this updater.
+ * On success em_dev_update_perf_domain() has incremented the usage
+ * counter, so from now on the EM framework keeps the reference (and
+ * frees the memory when needed).
+ */
+free_em_table:
+ em_table_free(em_table);
+ return ret;
+}
+
+/*
+ * Adjustment of CPU performance values after boot, when all CPUs' capacities
+ * are correctly calculated.
+ *
+ * Duplicates the current table of @pd, re-runs em_init_performance() on the
+ * copy and passes it to em_recalc_and_update(). An allocation failure is only
+ * reported with dev_warn(); the result of em_recalc_and_update() is not
+ * checked. @max_cap is not used by the body.
+ */
+static void em_adjust_new_capacity(struct device *dev,
+ struct em_perf_domain *pd,
+ u64 max_cap)
+{
+ struct em_perf_table __rcu *em_table;
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return;
+ }
+
+ em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);
+
+ em_recalc_and_update(dev, pd, em_table);
+}
+
+static void em_check_capacity_update(void)
+{
+ cpumask_var_t cpu_done_mask;
+ struct em_perf_state *table;
+ struct em_perf_domain *pd;
+ unsigned long cpu_capacity;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
+ pr_warn("no free memory\n");
+ return;
+ }
+
+ /*
+ * Check if CPUs capacity has changed, then update EM.
+ *
+ * Each performance domain is handled once: after the first of its
+ * CPUs is seen, all CPUs in em_span_cpus(pd) are added to
+ * cpu_done_mask and skipped on later iterations. CPUs without a
+ * domain and artificial domains are skipped.
+ */
+ for_each_possible_cpu(cpu) {
+ struct cpufreq_policy *policy;
+ unsigned long em_max_perf;
+ struct device *dev;
+
+ if (cpumask_test_cpu(cpu, cpu_done_mask))
+ continue;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ pr_debug("Accessing cpu%d policy failed\n", cpu);
+ schedule_delayed_work(&em_update_work,
+ msecs_to_jiffies(1000));
+ break; /* stop scanning; em_update_work was queued with a 1000 ms delay */
+ }
+ cpufreq_cpu_put(policy); /* the policy was only needed as an availability check */
+
+ pd = em_cpu_get(cpu);
+ if (!pd || em_is_artificial(pd))
+ continue;
+
+ cpumask_or(cpu_done_mask, cpu_done_mask,
+ em_span_cpus(pd));
+
+ cpu_capacity = arch_scale_cpu_capacity(cpu);
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ /*
+ * Check if the CPU capacity has been adjusted during boot
+ * and trigger the update for new performance values. The
+ * comparison is against the performance of the last entry
+ * in the table; equal values mean nothing needs updating.
+ */
+ if (em_max_perf == cpu_capacity)
+ continue;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
+ cpu, cpu_capacity, em_max_perf);
+
+ dev = get_cpu_device(cpu);
+ em_adjust_new_capacity(dev, pd, cpu_capacity);
+ }
+
+ free_cpumask_var(cpu_done_mask);
+}
+
+static void em_update_workfn(struct work_struct *work)
+{
+ em_check_capacity_update(); /* @work is unused; presumably the em_update_work handler - confirm at its definition */
+}
+
+/**
+ * em_dev_update_chip_binning() - Update Energy Model after the new voltage
+ * information is present in the OPPs.
+ * @dev : Device for which the Energy Model has to be updated.
+ *
+ * This function makes it easy to update the EM with new values available in
+ * the OPP framework and DT. It can be used after the chip has been properly
+ * verified by device drivers and the voltages adjusted for the 'chip binning'.
+ *
+ * The current table is duplicated, the power of every performance state is
+ * recomputed with dev_pm_opp_calc_power() for that state's frequency, and the
+ * result is handed to em_recalc_and_update().
+ *
+ * Return: -EINVAL if @dev is NULL or an error pointer, or if no Energy Model
+ * is found for it; -ENOMEM if the table copy cannot be allocated; the error
+ * from dev_pm_opp_calc_power() if it fails; otherwise the return value of
+ * em_recalc_and_update().
+ */
+int em_dev_update_chip_binning(struct device *dev)
+{
+ struct em_perf_table __rcu *em_table;
+ struct em_perf_domain *pd;
+ int i, ret;
+
+ if (IS_ERR_OR_NULL(dev))
+ return -EINVAL;
+
+ pd = em_pd_get(dev);
+ if (!pd) {
+ dev_warn(dev, "Couldn't find Energy Model\n");
+ return -EINVAL;
+ }
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /* Update power values which might change due to new voltage in OPPs */
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ unsigned long freq = em_table->state[i].frequency;
+ unsigned long power;
+
+ ret = dev_pm_opp_calc_power(dev, &power, &freq);
+ if (ret) {
+ em_table_free(em_table); /* the copy was never installed, release it here */
+ return ret;
+ }
+
+ em_table->state[i].power = power;
+ }
+
+ return em_recalc_and_update(dev, pd, em_table);
+}
+EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 4b0b7cf2e019..0a213f69a9e4 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -47,6 +47,15 @@ dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
+static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP;
+
+/*
+ * Compression/decompression algorithm to be used while saving/loading
+ * image to/from disk. This would later be used in 'kernel/power/swap.c'
+ * to allocate comp streams.
+ */
+char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
@@ -718,6 +727,9 @@ static int load_image_and_restore(void)
return error;
}
+#define COMPRESSION_ALGO_LZO "lzo"
+#define COMPRESSION_ALGO_LZ4 "lz4"
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
@@ -732,6 +744,17 @@ int hibernate(void)
return -EPERM;
}
+ /*
+ * Query for the compression algorithm support if compression is enabled.
+ */
+ if (!nocompress) {
+ strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
+ if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ return -EOPNOTSUPP;
+ }
+ }
+
sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
@@ -766,11 +789,24 @@ int hibernate(void)
if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE;
- if (nocompress)
+ if (nocompress) {
flags |= SF_NOCOMPRESS_MODE;
- else
+ } else {
flags |= SF_CRC32_MODE;
+ /*
+ * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4
+ * to override this behaviour and use LZ4.
+ *
+ * Refer to kernel/power/power.h for more details.
+ */
+
+ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4))
+ flags |= SF_COMPRESSION_ALG_LZ4;
+ else
+ flags |= SF_COMPRESSION_ALG_LZO;
+ }
+
pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
@@ -955,6 +991,22 @@ static int software_resume(void)
if (error)
goto Unlock;
+ /*
+ * Check if the hibernation image is compressed. If so, query for
+ * the algorithm support.
+ */
+ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
+ if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo));
+ else
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo));
+ if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ error = -EOPNOTSUPP;
+ goto Unlock;
+ }
+ }
+
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -1309,7 +1361,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy(resume_file, str, 255);
+ strscpy(resume_file, str);
return 1;
}
@@ -1370,6 +1422,57 @@ static int __init nohibernate_setup(char *str)
return 1;
}
+static const char * const comp_alg_enabled[] = {
+#if IS_ENABLED(CONFIG_CRYPTO_LZO)
+ COMPRESSION_ALGO_LZO,
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+ COMPRESSION_ALGO_LZ4,
+#endif
+};
+
+static int hibernate_compressor_param_set(const char *compressor,
+ const struct kernel_param *kp)
+{
+ unsigned int sleep_flags;
+ int index, ret;
+
+ sleep_flags = lock_system_sleep();
+
+ index = sysfs_match_string(comp_alg_enabled, compressor); /* only names listed in comp_alg_enabled[] are accepted */
+ if (index >= 0) {
+ ret = param_set_copystring(comp_alg_enabled[index], kp); /* store the table's spelling, not the raw input */
+ if (!ret)
+ strscpy(hib_comp_algo, comp_alg_enabled[index],
+ sizeof(hib_comp_algo)); /* mirror the accepted name into hib_comp_algo */
+ } else {
+ ret = index; /* no match: propagate the negative value from sysfs_match_string() */
+ }
+
+ unlock_system_sleep(sleep_flags);
+
+ if (ret)
+ pr_debug("Cannot set specified compressor %s\n",
+ compressor);
+
+ return ret;
+}
+
+static const struct kernel_param_ops hibernate_compressor_param_ops = {
+ .set = hibernate_compressor_param_set,
+ .get = param_get_string,
+};
+
+static struct kparam_string hibernate_compressor_param_string = {
+ .maxlen = sizeof(hibernate_compressor),
+ .string = hibernate_compressor,
+};
+
+module_param_cb(compressor, &hibernate_compressor_param_ops,
+ &hibernate_compressor_param_string, 0644);
+MODULE_PARM_DESC(compressor,
+ "Compression algorithm to be used with hibernation");
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b1ae9b677d03..a9e0693aaf69 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -95,19 +95,6 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-void pm_report_hw_sleep_time(u64 t)
-{
- suspend_stats.last_hw_sleep = t;
- suspend_stats.total_hw_sleep += t;
-}
-EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
-
-void pm_report_max_hw_sleep(u64 t)
-{
- suspend_stats.max_hw_sleep = t;
-}
-EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
-
int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
@@ -319,26 +306,86 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP_DEBUG */
-static char *suspend_step_name(enum suspend_stat_step step)
-{
- switch (step) {
- case SUSPEND_FREEZE:
- return "freeze";
- case SUSPEND_PREPARE:
- return "prepare";
- case SUSPEND_SUSPEND:
- return "suspend";
- case SUSPEND_SUSPEND_NOIRQ:
- return "suspend_noirq";
- case SUSPEND_RESUME_NOIRQ:
- return "resume_noirq";
- case SUSPEND_RESUME:
- return "resume";
- default:
- return "";
+#define SUSPEND_NR_STEPS SUSPEND_RESUME
+#define REC_FAILED_NUM 2
+
+struct suspend_stats {
+ unsigned int step_failures[SUSPEND_NR_STEPS];
+ unsigned int success;
+ unsigned int fail;
+ int last_failed_dev;
+ char failed_devs[REC_FAILED_NUM][40];
+ int last_failed_errno;
+ int errno[REC_FAILED_NUM];
+ int last_failed_step;
+ u64 last_hw_sleep;
+ u64 total_hw_sleep;
+ u64 max_hw_sleep;
+ enum suspend_stat_step failed_steps[REC_FAILED_NUM];
+};
+
+static struct suspend_stats suspend_stats;
+static DEFINE_MUTEX(suspend_stats_lock);
+
+void dpm_save_failed_dev(const char *name)
+{
+ mutex_lock(&suspend_stats_lock);
+
+ strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
+ name, sizeof(suspend_stats.failed_devs[0])); /* copy @name into the current slot, bounded by the slot size */
+ suspend_stats.last_failed_dev++;
+ suspend_stats.last_failed_dev %= REC_FAILED_NUM; /* wrap around: only REC_FAILED_NUM names are kept */
+
+ mutex_unlock(&suspend_stats_lock);
+}
+
+void dpm_save_failed_step(enum suspend_stat_step step)
+{
+ suspend_stats.step_failures[step-1]++; /* NOTE(review): index is step-1, so this assumes step >= SUSPEND_FREEZE - confirm callers */
+ suspend_stats.failed_steps[suspend_stats.last_failed_step] = step;
+ suspend_stats.last_failed_step++;
+ suspend_stats.last_failed_step %= REC_FAILED_NUM; /* wrap around: only REC_FAILED_NUM steps are kept */
+}
+
+void dpm_save_errno(int err)
+{
+ if (!err) {
+ suspend_stats.success++; /* err == 0 counts as a success and records nothing else */
+ return;
}
+
+ suspend_stats.fail++;
+
+ suspend_stats.errno[suspend_stats.last_failed_errno] = err;
+ suspend_stats.last_failed_errno++;
+ suspend_stats.last_failed_errno %= REC_FAILED_NUM; /* wrap around: only REC_FAILED_NUM error codes are kept */
}
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t; /* most recently reported value */
+ suspend_stats.total_hw_sleep += t; /* running sum of all reported values */
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t; /* stored as given; no comparison with the previous value */
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
+static const char * const suspend_step_names[] = {
+ [SUSPEND_WORKING] = "",
+ [SUSPEND_FREEZE] = "freeze",
+ [SUSPEND_PREPARE] = "prepare",
+ [SUSPEND_SUSPEND] = "suspend",
+ [SUSPEND_SUSPEND_LATE] = "suspend_late",
+ [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq",
+ [SUSPEND_RESUME_NOIRQ] = "resume_noirq",
+ [SUSPEND_RESUME_EARLY] = "resume_early",
+ [SUSPEND_RESUME] = "resume",
+};
+
#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
@@ -347,20 +394,30 @@ static ssize_t _name##_show(struct kobject *kobj, \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success, "%d\n");
-suspend_attr(fail, "%d\n");
-suspend_attr(failed_freeze, "%d\n");
-suspend_attr(failed_prepare, "%d\n");
-suspend_attr(failed_suspend, "%d\n");
-suspend_attr(failed_suspend_late, "%d\n");
-suspend_attr(failed_suspend_noirq, "%d\n");
-suspend_attr(failed_resume, "%d\n");
-suspend_attr(failed_resume_early, "%d\n");
-suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(success, "%u\n");
+suspend_attr(fail, "%u\n");
suspend_attr(last_hw_sleep, "%llu\n");
suspend_attr(total_hw_sleep, "%llu\n");
suspend_attr(max_hw_sleep, "%llu\n");
+#define suspend_step_attr(_name, step) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%u\n", \
+ suspend_stats.step_failures[step-1]); \
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_step_attr(failed_freeze, SUSPEND_FREEZE);
+suspend_step_attr(failed_prepare, SUSPEND_PREPARE);
+suspend_step_attr(failed_suspend, SUSPEND_SUSPEND);
+suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE);
+suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ);
+suspend_step_attr(failed_resume, SUSPEND_RESUME);
+suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY);
+suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ);
+
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -392,16 +449,14 @@ static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
static ssize_t last_failed_step_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int index;
enum suspend_stat_step step;
- char *last_failed_step = NULL;
+ int index;
index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
index %= REC_FAILED_NUM;
step = suspend_stats.failed_steps[index];
- last_failed_step = suspend_step_name(step);
- return sprintf(buf, "%s\n", last_failed_step);
+ return sprintf(buf, "%s\n", suspend_step_names[step]);
}
static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
@@ -449,6 +504,7 @@ static const struct attribute_group suspend_attr_group = {
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
+ enum suspend_stat_step step;
last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
last_dev %= REC_FAILED_NUM;
@@ -456,47 +512,35 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
last_errno %= REC_FAILED_NUM;
last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
last_step %= REC_FAILED_NUM;
- seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
- "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
- "success", suspend_stats.success,
- "fail", suspend_stats.fail,
- "failed_freeze", suspend_stats.failed_freeze,
- "failed_prepare", suspend_stats.failed_prepare,
- "failed_suspend", suspend_stats.failed_suspend,
- "failed_suspend_late",
- suspend_stats.failed_suspend_late,
- "failed_suspend_noirq",
- suspend_stats.failed_suspend_noirq,
- "failed_resume", suspend_stats.failed_resume,
- "failed_resume_early",
- suspend_stats.failed_resume_early,
- "failed_resume_noirq",
- suspend_stats.failed_resume_noirq);
+
+ seq_printf(s, "success: %u\nfail: %u\n",
+ suspend_stats.success, suspend_stats.fail);
+
+ for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++)
+ seq_printf(s, "failed_%s: %u\n", suspend_step_names[step],
+ suspend_stats.step_failures[step-1]);
+
seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
- suspend_stats.failed_devs[last_dev]);
+ suspend_stats.failed_devs[last_dev]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_dev + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-s\n",
- suspend_stats.failed_devs[index]);
+ seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]);
}
seq_printf(s, " last_failed_errno:\t%-d\n",
suspend_stats.errno[last_errno]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_errno + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-d\n",
- suspend_stats.errno[index]);
+ seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]);
}
seq_printf(s, " last_failed_step:\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[last_step]));
+ suspend_step_names[suspend_stats.failed_steps[last_step]]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_step + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
seq_printf(s, "\t\t\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[index]));
+ suspend_step_names[suspend_stats.failed_steps[index]]);
}
return 0;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 8499a39c62f4..de0e6b1077f2 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -6,6 +6,7 @@
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
+#include <linux/crypto.h>
struct swsusp_info {
struct new_utsname uts;
@@ -54,6 +55,10 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
+extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
+/* kernel/power/swap.c */
+extern unsigned int swsusp_header_flags;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -148,7 +153,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
extern unsigned long snapshot_get_image_size(void);
extern int snapshot_read_next(struct snapshot_handle *handle);
extern int snapshot_write_next(struct snapshot_handle *handle);
-extern void snapshot_write_finalize(struct snapshot_handle *handle);
+int snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
extern bool hibernate_acquire(void);
@@ -162,11 +167,25 @@ extern int swsusp_swap_in_use(void);
* Flags that can be passed from the hibernating kernel to the "boot" kernel in
* the image header.
*/
+#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
#define SF_HW_SIG 8
+/*
+ * Bit to indicate the compression algorithm to be used(for LZ4). The same
+ * could be checked while saving/loading image to/from disk to use the
+ * corresponding algorithms.
+ *
+ * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use
+ * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4.
+ *
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4
+ */
+#define SF_COMPRESSION_ALG_LZ4 16
+
/* kernel/power/hibernate.c */
int swsusp_check(bool exclusive);
extern void swsusp_free(void);
@@ -327,3 +346,5 @@ static inline void pm_sleep_enable_secondary_cpus(void)
suspend_enable_secondary_cpus();
cpuidle_resume();
}
+
+void dpm_save_errno(int err);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5c96ff067c64..405eddbda4fc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -58,22 +58,24 @@ static inline void hibernate_restore_protection_end(void)
hibernate_restore_protection_active = false;
}
-static inline void hibernate_restore_protect_page(void *page_address)
+static inline int __must_check hibernate_restore_protect_page(void *page_address)
{
if (hibernate_restore_protection_active)
- set_memory_ro((unsigned long)page_address, 1);
+ return set_memory_ro((unsigned long)page_address, 1);
+ return 0;
}
-static inline void hibernate_restore_unprotect_page(void *page_address)
+static inline int hibernate_restore_unprotect_page(void *page_address)
{
if (hibernate_restore_protection_active)
- set_memory_rw((unsigned long)page_address, 1);
+ return set_memory_rw((unsigned long)page_address, 1);
+ return 0;
}
#else
static inline void hibernate_restore_protection_begin(void) {}
static inline void hibernate_restore_protection_end(void) {}
-static inline void hibernate_restore_protect_page(void *page_address) {}
-static inline void hibernate_restore_unprotect_page(void *page_address) {}
+static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; }
+static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; }
#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
@@ -2832,7 +2834,9 @@ next:
}
} else {
copy_last_highmem_page();
- hibernate_restore_protect_page(handle->buffer);
+ error = hibernate_restore_protect_page(handle->buffer);
+ if (error)
+ return error;
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2858,15 +2862,18 @@ next:
* stored in highmem. Additionally, it recycles bitmap memory that's not
* necessary any more.
*/
-void snapshot_write_finalize(struct snapshot_handle *handle)
+int snapshot_write_finalize(struct snapshot_handle *handle)
{
+ int error;
+
copy_last_highmem_page();
- hibernate_restore_protect_page(handle->buffer);
+ error = hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) {
memory_bm_recycle(&orig_bm);
free_highmem_data();
}
+ return error;
}
int snapshot_image_loaded(struct snapshot_handle *handle)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index fa3bf161d13f..09f8397bae15 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -106,6 +106,12 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
+ /*
+ * Kick all CPUs to ensure that they resume their timers and restore
+ * consistent system state.
+ */
+ wake_up_all_idle_cpus();
+
cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
@@ -192,6 +198,7 @@ static int __init mem_sleep_default_setup(char *str)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
+ mem_sleep_current = state;
break;
}
@@ -367,7 +374,6 @@ static int suspend_prepare(suspend_state_t state)
if (!error)
return 0;
- suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
pm_notifier_call_chain(PM_POST_SUSPEND);
Restore:
@@ -617,12 +623,7 @@ int pm_suspend(suspend_state_t state)
pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
- if (error) {
- suspend_stats.fail++;
- dpm_save_failed_errno(error);
- } else {
- suspend_stats.success++;
- }
+ dpm_save_errno(error);
pr_info("suspend exit\n");
return error;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index b663a97f5867..d4856ec61570 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -201,7 +201,7 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm);
if (dev) {
rtc = rtc_class_open(dev_name(dev));
put_device(dev);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 692f12fe60c1..5bc04bfe2db1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,7 +23,6 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/slab.h>
-#include <linux/lzo.h>
#include <linux/vmalloc.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
@@ -339,6 +338,13 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
return error;
}
+/*
+ * Holds the swsusp_header flags. This is used in software_resume() in
+ * 'kernel/power/hibernate.c' to check if the image is compressed and, if so,
+ * to query for the compression algorithm support.
+ */
+unsigned int swsusp_header_flags;
+
/**
* swsusp_swap_check - check if the resume device is a swap device
* and get its index (if so)
@@ -514,25 +520,30 @@ static int swap_writer_finish(struct swap_map_handle *handle,
return error;
}
+/*
+ * Bytes we need for compressed data in worst case. We assume(limitation)
+ * this is the worst of all the compression algorithms.
+ */
+#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2)
+
/* We need to remember how much compressed data we need to read. */
-#define LZO_HEADER sizeof(size_t)
+#define CMP_HEADER sizeof(size_t)
/* Number of pages/bytes we'll compress at one time. */
-#define LZO_UNC_PAGES 32
-#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
+#define UNC_PAGES 32
+#define UNC_SIZE (UNC_PAGES * PAGE_SIZE)
-/* Number of pages/bytes we need for compressed data (worst case). */
-#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
- LZO_HEADER, PAGE_SIZE)
-#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
+/* Number of pages we need for compressed data (worst case). */
+#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \
+ CMP_HEADER, PAGE_SIZE)
+#define CMP_SIZE (CMP_PAGES * PAGE_SIZE)
/* Maximum number of threads for compression/decompression. */
-#define LZO_THREADS 3
+#define CMP_THREADS 3
/* Minimum/maximum number of pages for read buffering. */
-#define LZO_MIN_RD_PAGES 1024
-#define LZO_MAX_RD_PAGES 8192
-
+#define CMP_MIN_RD_PAGES 1024
+#define CMP_MAX_RD_PAGES 8192
/**
* save_image - save the suspend image data
@@ -593,8 +604,8 @@ struct crc_data {
wait_queue_head_t go; /* start crc update */
wait_queue_head_t done; /* crc update done */
u32 *crc32; /* points to handle's crc32 */
- size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
- unsigned char *unc[LZO_THREADS]; /* uncompressed data */
+ size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */
+ unsigned char *unc[CMP_THREADS]; /* uncompressed data */
};
/*
@@ -625,10 +636,11 @@ static int crc32_threadfn(void *data)
return 0;
}
/*
- * Structure used for LZO data compression.
+ * Structure used for data compression.
*/
struct cmp_data {
struct task_struct *thr; /* thread */
+ struct crypto_comp *cc; /* crypto compressor stream */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -636,17 +648,20 @@ struct cmp_data {
wait_queue_head_t done; /* compression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
- unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
+/* Indicates the image size after compression */
+static atomic_t compressed_size = ATOMIC_INIT(0);
+
/*
* Compression function that runs in its own thread.
*/
-static int lzo_compress_threadfn(void *data)
+static int compress_threadfn(void *data)
{
struct cmp_data *d = data;
+ unsigned int cmp_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -660,9 +675,13 @@ static int lzo_compress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- d->ret = lzo1x_1_compress(d->unc, d->unc_len,
- d->cmp + LZO_HEADER, &d->cmp_len,
- d->wrk);
+ cmp_len = CMP_SIZE - CMP_HEADER;
+ d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len,
+ d->cmp + CMP_HEADER,
+ &cmp_len);
+ d->cmp_len = cmp_len;
+
+ atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len);
atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
@@ -670,14 +689,14 @@ static int lzo_compress_threadfn(void *data)
}
/**
- * save_image_lzo - Save the suspend image data compressed with LZO.
+ * save_compressed_image - Save the suspend image data after compression.
* @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
-static int save_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_write)
+static int save_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
{
unsigned int m;
int ret = 0;
@@ -694,23 +713,25 @@ static int save_image_lzo(struct swap_map_handle *handle,
hib_init_batch(&hb);
+ atomic_set(&compressed_size, 0);
+
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
- pr_err("Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
- pr_err("Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
@@ -729,7 +750,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_compress_threadfn,
+ data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(compress_threadfn,
&data[thr],
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
@@ -767,7 +795,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- pr_info("Using %u thread(s) for compression\n", nr_threads);
+ pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo);
pr_info("Compressing and saving image data (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
@@ -777,7 +805,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
- for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+ for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) {
ret = snapshot_read_next(snapshot);
if (ret < 0)
goto out_finish;
@@ -817,14 +845,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- pr_err("LZO compression failed\n");
+ pr_err("%s compression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(data[thr].unc_len))) {
- pr_err("Invalid LZO compressed length\n");
+ bytes_worst_compress(data[thr].unc_len))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -840,7 +868,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
* read it.
*/
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(page, data[thr].cmp + off, PAGE_SIZE);
@@ -862,6 +890,9 @@ out_finish:
if (!ret)
pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ pr_info("Image size after compression: %d kbytes\n",
+ (atomic_read(&compressed_size) / 1024));
+
out_clean:
hib_finish_batch(&hb);
if (crc) {
@@ -870,9 +901,12 @@ out_clean:
kfree(crc);
}
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ if (data[thr].cc)
+ crypto_free_comp(data[thr].cc);
+ }
vfree(data);
}
if (page) free_page((unsigned long)page);
@@ -942,7 +976,7 @@ int swsusp_write(unsigned int flags)
if (!error) {
error = (flags & SF_NOCOMPRESS_MODE) ?
save_image(&handle, &snapshot, pages - 1) :
- save_image_lzo(&handle, &snapshot, pages - 1);
+ save_compressed_image(&handle, &snapshot, pages - 1);
}
out_finish:
error = swap_writer_finish(&handle, flags, error);
@@ -1100,8 +1134,8 @@ static int load_image(struct swap_map_handle *handle,
ret = err2;
if (!ret) {
pr_info("Image loading done\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
@@ -1109,10 +1143,11 @@ static int load_image(struct swap_map_handle *handle,
}
/*
- * Structure used for LZO data decompression.
+ * Structure used for data decompression.
*/
struct dec_data {
struct task_struct *thr; /* thread */
+ struct crypto_comp *cc; /* crypto compressor stream */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -1120,16 +1155,17 @@ struct dec_data {
wait_queue_head_t done; /* decompression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
/*
* Decompression function that runs in its own thread.
*/
-static int lzo_decompress_threadfn(void *data)
+static int decompress_threadfn(void *data)
{
struct dec_data *d = data;
+ unsigned int unc_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -1143,9 +1179,11 @@ static int lzo_decompress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- d->unc_len = LZO_UNC_SIZE;
- d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
- d->unc, &d->unc_len);
+ unc_len = UNC_SIZE;
+ d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len,
+ d->unc, &unc_len);
+ d->unc_len = unc_len;
+
if (clean_pages_on_decompress)
flush_icache_range((unsigned long)d->unc,
(unsigned long)d->unc + d->unc_len);
@@ -1157,14 +1195,14 @@ static int lzo_decompress_threadfn(void *data)
}
/**
- * load_image_lzo - Load compressed image data and decompress them with LZO.
+ * load_compressed_image - Load compressed image data and decompress it.
* @handle: Swap map handle to use for loading data.
* @snapshot: Image to copy uncompressed data into.
* @nr_to_read: Number of pages to load.
*/
-static int load_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_read)
+static int load_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
{
unsigned int m;
int ret = 0;
@@ -1189,18 +1227,18 @@ static int load_image_lzo(struct swap_map_handle *handle,
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);
- page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page)));
+ page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page)));
if (!page) {
- pr_err("Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
- pr_err("Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
@@ -1221,7 +1259,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_decompress_threadfn,
+ data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(decompress_threadfn,
&data[thr],
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
@@ -1262,18 +1307,18 @@ static int load_image_lzo(struct swap_map_handle *handle,
*/
if (low_free_pages() > snapshot_get_image_size())
read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
- read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
+ read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
- page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
+ page[i] = (void *)__get_free_page(i < CMP_PAGES ?
GFP_NOIO | __GFP_HIGH :
GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (!page[i]) {
- if (i < LZO_CMP_PAGES) {
+ if (i < CMP_PAGES) {
ring_size = i;
- pr_err("Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate %s pages\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1283,7 +1328,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- pr_info("Using %u thread(s) for decompression\n", nr_threads);
+ pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo);
pr_info("Loading and decompressing image data (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
@@ -1344,13 +1389,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].cmp_len = *(size_t *)page[pg];
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(LZO_UNC_SIZE))) {
- pr_err("Invalid LZO compressed length\n");
+ bytes_worst_compress(UNC_SIZE))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
- need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+ need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER,
PAGE_SIZE);
if (need > have) {
if (eof > 1) {
@@ -1361,7 +1406,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(data[thr].cmp + off,
page[pg], PAGE_SIZE);
@@ -1378,7 +1423,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
/*
* Wait for more data while we are decompressing.
*/
- if (have < LZO_CMP_PAGES && asked) {
+ if (have < CMP_PAGES && asked) {
ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
@@ -1396,14 +1441,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = data[thr].ret;
if (ret < 0) {
- pr_err("LZO decompression failed\n");
+ pr_err("%s decompression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
- data[thr].unc_len > LZO_UNC_SIZE ||
- data[thr].unc_len & (PAGE_SIZE - 1))) {
- pr_err("Invalid LZO uncompressed length\n");
+ data[thr].unc_len > UNC_SIZE ||
+ data[thr].unc_len & (PAGE_SIZE - 1))) {
+ pr_err("Invalid %s uncompressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -1441,8 +1486,8 @@ out_finish:
stop = ktime_get();
if (!ret) {
pr_info("Image loading done\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
@@ -1464,9 +1509,12 @@ out_clean:
kfree(crc);
}
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ if (data[thr].cc)
+ crypto_free_comp(data[thr].cc);
+ }
vfree(data);
}
vfree(page);
@@ -1500,7 +1548,7 @@ int swsusp_read(unsigned int *flags_p)
if (!error) {
error = (*flags_p & SF_NOCOMPRESS_MODE) ?
load_image(&handle, &snapshot, header->pages - 1) :
- load_image_lzo(&handle, &snapshot, header->pages - 1);
+ load_compressed_image(&handle, &snapshot, header->pages - 1);
}
swap_reader_finish(&handle);
end:
@@ -1535,6 +1583,7 @@ int swsusp_check(bool exclusive)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ swsusp_header_flags = swsusp_header->flags;
/* Reset swap signature now */
error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3a4e70366f35..3aa41ba22129 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -317,7 +317,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_ATOMIC_RESTORE:
- snapshot_write_finalize(&data->handle);
+ error = snapshot_write_finalize(&data->handle);
+ if (error)
+ break;
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
error = -EPERM;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 360182dfe4cf..3160d287d4cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -34,7 +34,7 @@
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
@@ -957,7 +957,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -2009,6 +2009,12 @@ static int console_trylock_spinning(void)
*/
mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ /*
+ * Update @console_may_schedule for trylock because the previous
+ * owner may have been schedulable.
+ */
+ console_may_schedule = 0;
+
return 1;
}
@@ -3269,6 +3275,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3304,8 +3325,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3331,7 +3352,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
diff --git a/kernel/profile.c b/kernel/profile.c
index 8a77769bc4b4..2b775cc5c28f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -344,49 +344,6 @@ void profile_tick(int type)
#include <linux/seq_file.h>
#include <linux/uaccess.h>
-static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
- return 0;
-}
-
-static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, prof_cpu_mask_proc_show, NULL);
-}
-
-static ssize_t prof_cpu_mask_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *pos)
-{
- cpumask_var_t new_value;
- int err;
-
- if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
- return -ENOMEM;
-
- err = cpumask_parse_user(buffer, count, new_value);
- if (!err) {
- cpumask_copy(prof_cpu_mask, new_value);
- err = count;
- }
- free_cpumask_var(new_value);
- return err;
-}
-
-static const struct proc_ops prof_cpu_mask_proc_ops = {
- .proc_open = prof_cpu_mask_proc_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = single_release,
- .proc_write = prof_cpu_mask_proc_write,
-};
-
-void create_prof_cpu_mask(void)
-{
- /* create /proc/irq/prof_cpu_mask */
- proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
-}
-
/*
* This function accesses profiling information. The returned data is
* binary: the sampling step and the actual contents of the profile
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2fabd497d659..d5f89f9ef29f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data)
return 0;
}
-static inline void ptrace_set_stopped(struct task_struct *task)
+static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
{
guard(spinlock)(&task->sighand->siglock);
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);
/*
* If the task is already STOPPED, set JOBCTL_TRAP_STOP and
* TRAPPING, and kick it so that it transits to TRACED. TRAPPING
@@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request,
return -EPERM;
task->ptrace = flags;
-
ptrace_link(task, current);
-
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
-
- ptrace_set_stopped(task);
+ ptrace_set_stopped(task, seize);
}
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e7d2dd267593..3e079de0f5b4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -31,7 +31,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPTION && !SMP
+ default y if !PREEMPT_RCU && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -85,9 +85,13 @@ config FORCE_TASKS_RCU
idle, and user-mode execution as quiescent states. Not for
manual selection in most cases.
-config TASKS_RCU
+config NEED_TASKS_RCU
bool
default n
+
+config TASKS_RCU
+ bool
+ default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 86fce206560e..38238e595a61 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -522,12 +522,18 @@ static inline void show_rcu_tasks_gp_kthreads(void) {}
#ifdef CONFIG_TASKS_RCU
struct task_struct *get_rcu_tasks_gp_kthread(void);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RCU
#ifdef CONFIG_TASKS_RUDE_RCU
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_TRACE_RCU
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif
+
#ifdef CONFIG_TASKS_RCU_GENERIC
void tasks_cblist_init_generic(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -557,8 +563,7 @@ static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq);
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
@@ -566,8 +571,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
unsigned long c);
void rcu_gp_set_torture_wait(int duration);
#else
-static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
- int *flags, unsigned long *gp_seq)
+static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
*flags = 0;
*gp_seq = 0;
@@ -587,20 +591,16 @@ static inline void rcu_gp_set_torture_wait(int duration) { }
#ifdef CONFIG_TINY_SRCU
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = sp->srcu_idx;
}
#elif defined(CONFIG_TREE_SRCU)
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq);
#endif
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45d6b4c3d199..807fbf6123a7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -381,6 +381,9 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ void (*get_gp_data)(int *flags, unsigned long *gp_seq);
+ void (*gp_slow_register)(atomic_t *rgssp);
+ void (*gp_slow_unregister)(atomic_t *rgssp);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -461,12 +464,13 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp)
WRITE_ONCE(rp->rtort_chkp, NULL);
smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
}
- i = READ_ONCE(rp->rtort_pipe_count);
+ i = rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(rp->rtort_pipe_count, i + 1);
- if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
+ if (i + 1 >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -564,10 +568,12 @@ static struct rcu_torture_ops rcu_ops = {
.call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
- .stats = NULL,
.gp_kthread_dbg = show_rcu_gp_kthreads,
.check_boost_failed = rcu_check_boost_fail,
.stall_dur = rcu_jiffies_till_stall_check,
+ .get_gp_data = rcutorture_get_gp_data,
+ .gp_slow_register = rcu_gp_slow_register,
+ .gp_slow_unregister = rcu_gp_slow_unregister,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -611,9 +617,6 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
- .cb_barrier = NULL,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "busted"
};
@@ -627,6 +630,11 @@ static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
+static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
+}
+
static int srcu_torture_read_lock(void)
{
if (cur_ops == &srcud_ops)
@@ -735,6 +743,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
@@ -773,6 +782,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
@@ -837,8 +847,6 @@ static struct rcu_torture_ops trivial_ops = {
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "trivial"
};
@@ -881,8 +889,7 @@ static struct rcu_torture_ops tasks_ops = {
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
.gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
- .fqs = NULL,
- .stats = NULL,
+ .get_gp_data = rcu_tasks_get_gp_data,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks"
@@ -921,9 +928,8 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .get_gp_data = rcu_tasks_rude_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
@@ -973,9 +979,8 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .get_gp_data = rcu_tasks_trace_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks-tracing"
@@ -1399,6 +1404,7 @@ rcu_torture_writer(void *arg)
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
rcu_torture_writer_state = RTWS_DELAY;
udelay(torture_random(&rand) & 0x3ff);
rcu_torture_writer_state = RTWS_REPLACE;
@@ -1414,6 +1420,7 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+ ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count);
// Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
@@ -1586,7 +1593,8 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1997,7 +2005,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
+ // Should not happen in a correct RCU implementation,
+ // happens quite often for torture_type=busted.
pipe_count = RCU_TORTURE_PIPE_LEN;
}
completed = cur_ops->get_gp_seq();
@@ -2259,10 +2268,8 @@ rcu_torture_stats_print(void)
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
- rcutorture_get_gp_data(cur_ops->ttype,
- &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
- &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
@@ -2486,8 +2493,8 @@ static int rcu_torture_stall(void *args)
preempt_disable();
pr_alert("%s start on CPU %d.\n",
__func__, raw_smp_processor_id());
- while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
- stop_at))
+ while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
+ !kthread_should_stop())
if (stall_cpu_block) {
#ifdef CONFIG_PREEMPTION
preempt_schedule();
@@ -2832,13 +2839,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
!shutdown_time_arrived()) {
- WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
- pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+ if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n",
__func__,
stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
- n_max_gps, n_max_cbs, cver, gps);
+ n_max_gps, n_max_cbs, cver, gps, num_online_cpus());
atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
@@ -3040,11 +3048,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
}
/* IPI handler to get callback posted on desired CPU, if online. */
-static void rcu_torture_barrier1cb(void *rcu_void)
+static int rcu_torture_barrier1cb(void *rcu_void)
{
struct rcu_head *rhp = rcu_void;
cur_ops->call(rhp, rcu_torture_barrier_cbf);
+ return 0;
}
/* kthread function to register callbacks used to test RCU barriers. */
@@ -3070,11 +3079,9 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
- if (smp_call_function_single(myid, rcu_torture_barrier1cb,
- &rcu, 1)) {
- // IPI failed, so use direct call from current CPU.
+ if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1))
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
- }
+
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -3340,12 +3347,12 @@ rcu_torture_cleanup(void)
pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
}
- rcu_gp_slow_unregister(NULL);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
- rcu_gp_slow_unregister(NULL);
return;
}
@@ -3384,8 +3391,8 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n",
cur_ops->name, (long)gp_seq, flags,
rcutorture_seq_diff(gp_seq, start_gp_seq));
@@ -3444,7 +3451,8 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
- rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -3756,8 +3764,8 @@ rcu_torture_init(void)
nrealreaders = 1;
}
rcu_torture_print_module_parms(cur_ops, "Start of test");
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
start_gp_seq = gp_seq;
pr_alert("%s: Start-test grace-period state: g%ld f%#x\n",
cur_ops->name, (long)gp_seq, flags);
@@ -3926,7 +3934,8 @@ rcu_torture_init(void)
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
- rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
+ if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
+ cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d6..5afd5cf494db 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+ int newval;
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ preempt_enable();
if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
@@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
+ preempt_enable(); // Rebalance the preempt_disable() above before bailing out.
return; /* Already running or nothing to do. */
+ }
/* Remove recently arrived callbacks and wait for readers. */
WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp)
idx = (ssp->srcu_idx & 0x2) / 2;
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ preempt_enable();
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ preempt_disable(); // Needed for PREEMPT_AUTO
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ preempt_enable();
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -150,8 +159,11 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
+ preempt_disable(); // Needed for PREEMPT_AUTO
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+ preempt_enable();
+ if (idx)
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
+ preempt_disable(); // Needed for PREEMPT_AUTO
cookie = get_state_synchronize_srcu(ssp);
- if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+ preempt_enable();
return;
+ }
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
@@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
+ preempt_enable();
}
/*
@@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
+ preempt_disable(); // Needed for PREEMPT_AUTO
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
*/
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
- unsigned long ret = get_state_synchronize_srcu(ssp);
+ unsigned long ret;
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e4d673fc30f4..bc4b58b0204e 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1826,12 +1826,9 @@ static void process_srcu(struct work_struct *work)
srcu_reschedule(ssp, curdelay);
}
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *ssp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 86df878a2fee..6c2bd9001adc 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* we are called at early boot time but this shouldn't happen.
*/
}
- rsp->gp_count++;
+ WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
spin_unlock_irq(&rsp->rss_lock);
if (gp_state == GP_IDLE) {
@@ -151,11 +151,15 @@ void rcu_sync_enter(struct rcu_sync *rsp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
+ int gpc;
+
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
- if (!--rsp->gp_count) {
+ gpc = rsp->gp_count - 1;
+ WRITE_ONCE(rsp->gp_count, gpc);
+ if (!gpc) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 147b5945d67a..e1bf33018e6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -74,6 +74,7 @@ struct rcu_tasks_percpu {
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
* @rtpcpu: This flavor's rcu_tasks_percpu structure.
* @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
* @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
@@ -107,6 +108,7 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ unsigned int wait_state;
struct rcu_tasks_percpu __percpu *rtpcpu;
int percpu_enqueue_shift;
int percpu_enqueue_lim;
@@ -134,6 +136,7 @@ static struct rcu_tasks rt_name = \
.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
.gp_func = gp, \
.call_func = call, \
+ .wait_state = TASK_UNINTERRUPTIBLE, \
.rtpcpu = &rt_name ## __percpu, \
.lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
@@ -147,7 +150,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
-/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
#endif
@@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
- wait_rcu_gp(rtp->call_func);
+ wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
return;
}
rcu_tasks_one_gp(rtp, true);
@@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void)
rcu_tasks.postscan_func = rcu_tasks_postscan;
rcu_tasks.holdouts_func = check_all_holdout_tasks;
rcu_tasks.postgp_func = rcu_tasks_postgp;
+ rcu_tasks.wait_state = TASK_IDLE;
rcu_spawn_tasks_kthread_generic(&rcu_tasks);
return 0;
}
@@ -1178,6 +1182,13 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data);
+
/*
* Protect against tasklist scan blind spot while the task is exiting and
* may be removed from the tasklist. Do this by adding the task to yet
@@ -1199,8 +1210,7 @@ void exit_tasks_rcu_start(void)
rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
t->rcu_tasks_exit_cpu = smp_processor_id();
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- if (!rtpcp->rtp_exit_list.next)
- INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ WARN_ON_ONCE(!rtpcp->rtp_exit_list.next);
list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
preempt_enable();
@@ -1358,6 +1368,13 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1457,6 +1474,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v)
/*
* Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
* the four-byte operand-size restriction of some platforms.
+ *
* Returns the old value, which is often ignored.
*/
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
@@ -1468,7 +1486,14 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
if (trs_old.b.need_qs != old)
return trs_old.b.need_qs;
trs_new.b.need_qs = new;
- ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+
+ // Although cmpxchg() appears to KCSAN to update all four bytes,
+ // only the .b.need_qs byte actually changes.
+ instrument_atomic_read_write(&t->trc_reader_special.b.need_qs,
+ sizeof(t->trc_reader_special.b.need_qs));
+ // Avoid false-positive KCSAN failures.
+ ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s));
+
return ret.b.need_qs;
}
EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
@@ -1994,7 +2019,7 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
@@ -2010,6 +2035,13 @@ struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 705c0d16850a..4402d6f5f857 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -130,9 +130,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- local_bh_disable();
rcu_reclaim_tiny(list);
- local_bh_enable();
list = next;
}
}
@@ -155,7 +153,9 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ preempt_disable();
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9642dd06c25..28c7031711a3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,6 +75,7 @@
#define MODULE_PARAM_PREFIX "rcutree."
/* Data structures. */
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
@@ -93,6 +94,8 @@ static struct rcu_state rcu_state = {
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
+ .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
+ rcu_sr_normal_gp_cleanup_work),
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -240,8 +243,36 @@ static long rcu_get_n_cbs_cpu(int cpu)
return 0;
}
+/**
+ * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
+ *
+ * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
+ * This is a special-purpose function to be used in the softirq
+ * infrastructure and perhaps the occasional long-running softirq
+ * handler.
+ *
+ * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
+ * equivalent to momentarily completely enabling preemption. For
+ * example, given this code::
+ *
+ * local_bh_disable();
+ * do_something();
+ * rcu_softirq_qs(); // A
+ * do_something_else();
+ * local_bh_enable(); // B
+ *
+ * A call to synchronize_rcu() that began concurrently with the
+ * call to do_something() would be guaranteed to wait only until
+ * execution reached statement A. Without that rcu_softirq_qs(),
+ * that same synchronize_rcu() would instead be guaranteed to wait
+ * until execution reached statement B.
+ */
void rcu_softirq_qs(void)
{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal rcu_softirq_qs() in RCU read-side critical section");
rcu_qs();
rcu_preempt_deferred_qs(current);
rcu_tasks_qs(current, false);
@@ -508,17 +539,10 @@ static struct rcu_node *rcu_get_root(void)
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq)
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
- switch (test_type) {
- case RCU_FLAVOR:
- *flags = READ_ONCE(rcu_state.gp_flags);
- *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
- break;
- default:
- break;
- }
+ *flags = READ_ONCE(rcu_state.gp_flags);
+ *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
@@ -813,8 +837,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
return 1; /* Break things loose after complaining. */
}
@@ -1423,6 +1447,305 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
}
/*
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes, which need to
+ * wait for the current GP to complete.
+ * done tail: Tracks the set of nodes, for which grace
+ * period has elapsed. These nodes processing
+ * will be done as part of the cleanup work
+ * execution by a kworker.
+ *
+ * At every grace period init, a new wait node is added
+ * to the llist. This wait node is used as the wait tail
+ * for this new grace period. The number of wait nodes is
+ * fixed, so all of them can be in use (which can happen
+ * when kworker callback processing is delayed) at the
+ * time an additional grace period is requested. This
+ * means that the system is slow in processing callbacks.
+ *
+ * TODO: If slow processing is detected, the first node
+ * in the llist should be used as the wait tail for this
+ * grace period, so that users which had to wait due to
+ * slow processing are handled by _this_ grace period
+ * and not the next one.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ * +----------+ +--------+ +-------+
+ * | | | | | |
+ * | head |---------> | cb2 |--------->| cb1 |
+ * | | | | | |
+ * +----------+ +--------+ +-------+
+ *
+ *
+ *
+ * b. New GP1 Start:
+ *
+ * WAIT TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ * WAIT TAIL DONE TAIL
+ * | |
+ * | |
+ * v v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e). So, done tail updates and reads need
+ * to use the rel-acq semantics. If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail. If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantees that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+
+ * | | | |
+ * | head ------> wait |
+ * | | | head2 |
+ * +----------+ +--------+
+ *
+ */
+static bool rcu_sr_is_wait_head(struct llist_node *node)
+{
+ return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+ node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+ struct sr_wait_node *sr_wn;
+ int i;
+
+ for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+ sr_wn = &(rcu_state.srs_wait_nodes)[i];
+
+ if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+ return &sr_wn->node;
+ }
+
+ return NULL;
+}
+
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+ struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
+
+ atomic_set_release(&sr_wn->inuse, 0);
+}
+
+/* Disabled by default. */
+static int rcu_normal_wake_from_gp;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
+
+static void rcu_sr_normal_complete(struct llist_node *node)
+{
+ struct rcu_synchronize *rs = container_of(
+ (struct rcu_head *) node, struct rcu_synchronize, head);
+ unsigned long oldstate = (unsigned long) rs->head.func;
+
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ !poll_state_synchronize_rcu(oldstate),
+ "A full grace period is not passed yet: %lu",
+ rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+
+ /* Finally. */
+ complete(&rs->completion);
+}
+
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
+{
+ struct llist_node *done, *rcu, *next, *head;
+
+ /*
+ * This work execution can potentially execute
+ * while a new done tail is being updated by
+ * grace period kthread in rcu_sr_normal_gp_cleanup().
+ * So, read and updates of done tail need to
+ * follow acq-rel semantics.
+ *
+ * Given that wq semantics guarantees that a single work
+ * cannot execute concurrently by multiple kworkers,
+ * the done tail list manipulations are protected here.
+ */
+ done = smp_load_acquire(&rcu_state.srs_done_tail);
+ if (!done)
+ return;
+
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+ head = done->next;
+ done->next = NULL;
+
+ /*
+ * The dummy node, which is pointed to by the
+ * done tail that was acq-read above, is not removed
+ * here. This allows lockless additions of new
+ * rcu_synchronize nodes in rcu_sr_normal_add_req()
+ * while the cleanup work executes. The dummy
+ * node is removed in the next round of cleanup
+ * work execution.
+ */
+ llist_for_each_safe(rcu, next, head) {
+ if (!rcu_sr_is_wait_head(rcu)) {
+ rcu_sr_normal_complete(rcu);
+ continue;
+ }
+
+ rcu_sr_put_wait_head(rcu);
+ }
+}
+
+/*
+ * Helper function for rcu_gp_cleanup().
+ */
+static void rcu_sr_normal_gp_cleanup(void)
+{
+ struct llist_node *wait_tail, *next, *rcu;
+ int done = 0;
+
+ wait_tail = rcu_state.srs_wait_tail;
+ if (wait_tail == NULL)
+ return;
+
+ rcu_state.srs_wait_tail = NULL;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
+
+ /*
+ * Process (a) and (d) cases. See the illustration above.
+ */
+ llist_for_each_safe(rcu, next, wait_tail->next) {
+ if (rcu_sr_is_wait_head(rcu))
+ break;
+
+ rcu_sr_normal_complete(rcu);
+ // It can be last, update a next on this step.
+ wait_tail->next = next;
+
+ if (++done == SR_MAX_USERS_WAKE_FROM_GP)
+ break;
+ }
+
+ // concurrent sr_normal_gp_cleanup work might observe this update.
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+
+ /*
+ * Schedule work to perform the final processing of any
+ * outstanding users (if still left) and to release the
+ * wait heads added by rcu_sr_normal_gp_init().
+ */
+ queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+}
+
+/*
+ * Helper function for rcu_gp_init().
+ */
+static bool rcu_sr_normal_gp_init(void)
+{
+ struct llist_node *first;
+ struct llist_node *wait_head;
+ bool start_new_poll = false;
+
+ first = READ_ONCE(rcu_state.srs_next.first);
+ if (!first || rcu_sr_is_wait_head(first))
+ return start_new_poll;
+
+ wait_head = rcu_sr_get_wait_head();
+ if (!wait_head) {
+ // Kick another GP to retry.
+ start_new_poll = true;
+ return start_new_poll;
+ }
+
+ /* Inject a wait-dummy-node. */
+ llist_add(wait_head, &rcu_state.srs_next);
+
+ /*
+ * A waiting list of rcu_synchronize nodes should be empty on
+ * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(),
+ * rolls it over. If not, it is a BUG, warn a user.
+ */
+ WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+ rcu_state.srs_wait_tail = wait_head;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+
+ return start_new_poll;
+}
+
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
+{
+ llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
+}
+
+/*
* Initialize a new grace period. Return false if no grace period required.
*/
static noinline_for_stack bool rcu_gp_init(void)
@@ -1432,10 +1755,11 @@ static noinline_for_stack bool rcu_gp_init(void)
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
+ bool start_new_poll;
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
- if (!READ_ONCE(rcu_state.gp_flags)) {
+ if (!rcu_state.gp_flags) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq_rcu_node(rnp);
return false;
@@ -1456,11 +1780,25 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
+ start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
+ * The "start_new_poll" is set to true only when this GP is not able
+ * to handle anything and there are outstanding users. It happens when
+ * the rcu_sr_normal_gp_init() function was not able to insert a dummy
+ * separator into the llist because there were no dummy nodes left.
+ *
+ * The number of dummy nodes is fixed, so we may run out of them. If
+ * so, we start a new poll request to retry. This is rare, and it
+ * means that the system is processing callbacks slowly.
+ */
+ if (start_new_poll)
+ (void) start_poll_synchronize_rcu();
+
+ /*
* Apply per-leaf buffered online and offline operations to
* the rcu_node tree. Note that this new grace period need not
* wait for subsequent online CPUs, and that RCU hooks in the CPU
@@ -1620,8 +1958,7 @@ static void rcu_gp_fqs(bool first_time)
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq_rcu_node(rnp);
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq_rcu_node(rnp);
}
}
@@ -1825,6 +2162,9 @@ static noinline void rcu_gp_cleanup(void)
}
raw_spin_unlock_irq_rcu_node(rnp);
+ // Make synchronize_rcu() users aware of the end of old grace period.
+ rcu_sr_normal_gp_cleanup();
+
// If strict, make all CPUs aware of the end of the old grace period.
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
@@ -1882,8 +2222,7 @@ static void rcu_report_qs_rsp(unsigned long flags)
{
raw_lockdep_assert_held_rcu_node(rcu_get_root());
WARN_ON_ONCE(!rcu_gp_in_progress());
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
rcu_gp_kthread_wake();
}
@@ -2398,8 +2737,7 @@ void rcu_force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
@@ -3559,6 +3897,43 @@ static int rcu_blocking_is_gp(void)
return true;
}
+/*
+ * Helper function for the synchronize_rcu() API.
+ */
+static void synchronize_rcu_normal(void)
+{
+ struct rcu_synchronize rs;
+
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
+
+ if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ wait_rcu_gp(call_rcu_hurry);
+ goto trace_complete_out;
+ }
+
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+
+ /*
+ * This code might be preempted, therefore take a GP
+ * snapshot before adding a request.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ rs.head.func = (void *) get_state_synchronize_rcu();
+
+ rcu_sr_normal_add_req(&rs);
+
+ /* Kick a GP and start waiting. */
+ (void) start_poll_synchronize_rcu();
+
+ /* Now we can wait. */
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+
+trace_complete_out:
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
+}
+
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -3610,7 +3985,7 @@ void synchronize_rcu(void)
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
@@ -4303,7 +4678,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
// whether spinlocks may be acquired safely.
static bool rcu_init_invoked(void)
{
- return !!rcu_state.n_online_cpus;
+ return !!READ_ONCE(rcu_state.n_online_cpus);
}
/*
@@ -4395,9 +4770,9 @@ rcu_boot_init_percpu_data(int cpu)
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -4513,6 +4888,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
@@ -4656,7 +5032,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
@@ -4707,7 +5083,7 @@ void rcutree_report_cpu_dead(void)
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
rcu_disable_urgency_upon_qs(rdp);
@@ -4781,6 +5157,7 @@ void rcutree_migrate_callbacks(int cpu)
*/
int rcutree_dead_cpu(unsigned int cpu)
{
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -5229,6 +5606,9 @@ void __init rcu_init(void)
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!sync_wq);
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df48160b3136..bae7925c497f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -273,9 +273,9 @@ struct rcu_data {
bool rcu_iw_pending; /* Is ->rcu_iw pending? */
unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */
unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */
- short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */
+ short rcu_ofl_gp_state; /* ->gp_state at last offline. */
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
- short rcu_onl_gp_flags; /* ->gp_flags at last online. */
+ short rcu_onl_gp_state; /* ->gp_state at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
@@ -316,6 +316,19 @@ do { \
} while (0)
/*
+ * The maximum number of synchronize_rcu() users that are
+ * awakened directly by rcu_gp_kthread(). The remaining
+ * users are deferred to the main worker.
+ */
+#define SR_MAX_USERS_WAKE_FROM_GP 5
+#define SR_NORMAL_GP_WAIT_HEAD_MAX 5
+
+struct sr_wait_node {
+ atomic_t inuse;
+ struct llist_node node;
+};
+
+/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -400,6 +413,13 @@ struct rcu_state {
/* Synchronize offline with */
/* GP pre-initialization. */
int nocb_is_setup; /* nocb is setup from boot */
+
+ /* synchronize_rcu() part. */
+ struct llist_head srs_next; /* request a GP users. */
+ struct llist_node *srs_wait_tail; /* wait for GP users. */
+ struct llist_node *srs_done_tail; /* ready for GP users. */
+ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
+ struct work_struct srs_cleanup_work;
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6b83537480b1..8a1d9c8bd9f7 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -930,7 +930,7 @@ void synchronize_rcu_expedited(void)
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36a8b5dbf5b5..340bbefe5f65 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -805,8 +805,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
rdp = per_cpu_ptr(&rcu_data, cpu);
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
}
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5d666428546b..460efecd077b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -504,7 +504,8 @@ static void print_cpu_stall_info(int cpu)
rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
- sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ // Print signed value, as negative values indicate a probable bug.
+ snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j);
pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
@@ -579,7 +580,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
rcu_state.name, (jiffies - jiffies_fqs),
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
+ data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
@@ -628,7 +629,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -689,7 +691,8 @@ static void print_cpu_stall(unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 46aaaa9fe339..f8436969e0c8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
@@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
if (crcu_array[j] == crcu_array[i])
break;
if (j == i) {
- wait_for_completion(&rs_array[i].completion);
+ wait_for_completion_state(&rs_array[i].completion, state);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d44efa0d0611..1a914388144a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -5662,13 +5662,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5679,8 +5679,8 @@ void scheduler_tick(void)
rq_lock(rq, &rf);
update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
@@ -5700,7 +5700,7 @@ void scheduler_tick(void)
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ sched_balance_trigger(rq);
#endif
}
@@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * interrupt handler sched_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
@@ -6647,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
- * after coming from user-space, before storing to rq->curr.
+ * after coming from user-space, before storing to rq->curr; this
+ * barrier matches a full barrier in the proximity of the membarrier
+ * system call exit.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -6718,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* Here are the schemes providing that barrier on the
* various architectures:
- * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
- * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+ * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+ * on PowerPC and on RISC-V.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
+ *
+ * The barrier matches a full barrier in the proximity of
+ * the membarrier system call entry.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
++*switch_count;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..aa48b2ec879d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_kernel(prev);
-
- vtime_flush(prev);
- arch_vtime_task_switch(prev);
-}
-# endif
-
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
unsigned int pc = irq_count() - offset;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..146ecf9cc3af 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -388,8 +382,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
@@ -406,7 +400,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
@@ -595,13 +589,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* [[ NOTE: this is only equal to the ideal scheduler under the condition
* that join/leave operations happen at lag_i = 0, otherwise the
- * virtual time has non-continguous motion equivalent to:
+ * virtual time has non-contiguous motion equivalent to:
*
* V +-= lag_i / W
*
* Also see the comment in place_entity() that deals with this. ]]
*
- * However, since v_i is u64, and the multiplcation could easily overflow
+ * However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
*
* Substitute: v_i == (v_i - v0) + v0
@@ -671,7 +665,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
}
if (load) {
- /* sign flips effective floor / ceil */
+ /* sign flips effective floor / ceiling */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
@@ -696,15 +690,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
{
- s64 lag, limit;
+ s64 vlag, limit;
+
+ vlag = avruntime - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ return clamp(vlag, -limit, limit);
+}
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
SCHED_WARN_ON(!se->on_rq);
- lag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- se->vlag = clamp(lag, -limit, limit);
+ se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
}
/*
@@ -721,7 +721,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
*
- * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
*/
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -1024,7 +1024,7 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/*
@@ -1616,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -1831,6 +1831,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int last_cpupid, this_cpupid;
/*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
@@ -3284,7 +3290,7 @@ retry_pids:
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
@@ -3295,7 +3301,7 @@ retry_pids:
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
if (!vma_is_accessible(vma)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
@@ -3327,7 +3333,7 @@ retry_pids:
}
/*
- * Scanning the VMA's of short lived tasks add more overhead. So
+ * Scanning the VMAs of short-lived tasks adds more overhead. So
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
@@ -3371,7 +3377,7 @@ retry_pids:
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
- * is not already pte-numa. If the VMA contains
+ * is not already PTE-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
@@ -3670,16 +3676,15 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
-static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
unsigned long weight)
{
unsigned long old_weight = se->load.weight;
- u64 avruntime = avg_vruntime(cfs_rq);
s64 vlag, vslice;
/*
* VRUNTIME
- * ========
+ * --------
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
@@ -3755,14 +3760,14 @@ static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
* = V - vl'
*/
if (avruntime != se->vruntime) {
- vlag = (s64)(avruntime - se->vruntime);
+ vlag = entity_lag(avruntime, se);
vlag = div_s64(vlag * old_weight, weight);
se->vruntime = avruntime - vlag;
}
/*
* DEADLINE
- * ========
+ * --------
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
@@ -3781,25 +3786,26 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
- if (curr)
- update_curr(cfs_rq);
- else
+ update_curr(cfs_rq);
+ avruntime = avg_vruntime(cfs_rq);
+ if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (!se->on_rq) {
+ if (se->on_rq) {
+ reweight_eevdf(se, avruntime, weight);
+ } else {
/*
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
* we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * se->load.weight, weight);
- } else {
- reweight_eevdf(cfs_rq, se, weight);
}
update_load_set(&se->load, weight);
@@ -4733,7 +4739,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
+ * track group sched_entity load average for task_h_load calculation in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -4816,7 +4822,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
@@ -4959,13 +4965,22 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
bool fits, uclamp_max_fits;
/*
@@ -4987,7 +5002,7 @@ static inline int util_fits_cpu(unsigned long util,
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
- * Only exception is for thermal pressure since it has a direct impact
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
* on available OPP of the system.
*
* We honour it for uclamp_min only as a drop in performance level
@@ -4997,7 +5012,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5014,14 +5028,14 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | |
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* In the above example if a task is capped to a specific performance
* point, y, then when:
*
- * * util = 80% of x then it does not fit on cpu0 and should migrate
- * to cpu1
- * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
* uclamp_max request.
*
* which is what we're enforcing here. A task always fits if
@@ -5052,7 +5066,7 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | | (region c, boosted, util < uclamp_min)
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* a) If util > uclamp_max, then we're capped, we don't care about
* actual fitness value here. We only care if uclamp_max fits
@@ -5072,7 +5086,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
@@ -5092,15 +5107,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;
- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Does affinity allow us to go somewhere higher? Or are we on the biggest
+ * available CPU already? Or do we fit into this CPU?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -5136,7 +5155,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -5242,7 +5261,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
/*
- * When joining the competition; the exisiting tasks will be,
+ * When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
@@ -5391,7 +5410,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
+ * further than we started -- i.e. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
@@ -5427,7 +5446,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
@@ -6663,22 +6682,47 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
-static inline void update_overutilized_status(struct rq *rq)
+/*
+ * The overutilized value makes sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+}
+
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
+{
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6779,7 +6823,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* and the following generally works well enough in practice.
*/
if (!task_new)
- update_overutilized_status(rq);
+ check_update_overutilized_status(rq);
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -6866,7 +6910,7 @@ dequeue_throttle:
#ifdef CONFIG_SMP
-/* Working cpumask for: load_balance, load_balance_newidle. */
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@ -7098,13 +7142,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
- * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -7160,7 +7204,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
@@ -7185,13 +7229,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu);
+ group = sched_balance_find_dst_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_group_cpu(group, p, cpu);
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
@@ -7459,7 +7503,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7503,7 +7547,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/*
* On asymmetric system, update task utilization because we will check
- * that the task fits with cpu's capacity.
+ * that the task fits with CPU's capacity.
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
@@ -7936,7 +7980,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
@@ -7952,15 +7996,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
+ if (!pd)
goto unlock;
/*
@@ -7983,7 +8027,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -7995,18 +8039,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8027,7 +8070,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
/*
* Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
+ * the clamp() part. I.e.: apply max aggregation
* only. util_fits_cpu() logic requires to
* operate on non clamped util but must use the
* max-aggregated uclamp_{min, max}.
@@ -8077,7 +8120,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8092,7 +8135,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8113,14 +8156,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
@@ -8163,7 +8206,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
cpumask_test_cpu(cpu, p->cpus_ptr))
return cpu;
- if (sched_energy_enabled()) {
+ if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -8201,7 +8244,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (unlikely(sd)) {
/* Slow path */
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -8247,14 +8290,46 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}
+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (rq->nr_running)
return 1;
- return newidle_balance(rq, rf) != 0;
+ return sched_balance_newidle(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
@@ -8505,10 +8580,10 @@ idle:
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
+ new_tasks = sched_balance_newidle(rq, rf);
/*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -8586,7 +8661,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -8924,7 +8999,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- /* Disregard pcpu kthreads; they are where they need to be. */
+ /* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9070,7 +9145,7 @@ static int detach_tasks(struct lb_env *env)
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ if (env->idle && env->src_rq->nr_running <= 1)
break;
env->loop++;
@@ -9249,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (cpu_util_dl(rq))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
if (cpu_util_irq(rq))
@@ -9279,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
bool decayed;
/*
@@ -9288,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -9411,7 +9486,7 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
-static void update_blocked_averages(int cpu)
+static void sched_balance_update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
@@ -9430,25 +9505,25 @@ static void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
}
-/********** Helpers for find_busiest_group ************************/
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long group_capacity;
- unsigned long group_util; /* Total utilization over the CPUs of the group */
- unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- unsigned int sum_nr_running; /* Nr of tasks running in the group */
- unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- unsigned int group_smt_balance; /* Task on busy SMT be moved */
- unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -9456,19 +9531,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
- unsigned int prefer_sibling; /* tasks should go to sibling first */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -9494,8 +9568,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9507,12 +9581,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = cpu_util_rt(rq);
used += cpu_util_dl(rq);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
@@ -9605,16 +9676,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}
/*
@@ -9638,7 +9703,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -9803,7 +9868,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
struct sched_group *group)
{
- if (env->idle == CPU_NOT_IDLE)
+ if (!env->idle)
return false;
/*
@@ -9827,7 +9892,7 @@ static inline long sibling_imbalance(struct lb_env *env,
int ncores_busiest, ncores_local;
long imbalance;
- if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ if (!env->idle || !busiest->sum_nr_running)
return 0;
ncores_busiest = sds->busiest->cores;
@@ -9873,13 +9938,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
- * @sg_status: Holds flag indicating the status of the sched_group
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
int i, nr_running, local_group;
@@ -9900,10 +9967,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -9925,10 +9992,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
}
- } else if ((env->idle != CPU_NOT_IDLE) &&
- sched_reduced_capacity(rq, env->sd)) {
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
/* Check for a task running on a CPU with reduced capacity */
if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
@@ -9940,7 +10006,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
@@ -10078,7 +10144,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
has_spare:
/*
- * Select not overloaded group with lowest number of idle cpus
+ * Select not overloaded group with lowest number of idle CPUs
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
@@ -10298,13 +10364,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * find_idlest_group() finds and returns the least busy CPU group within the
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -10552,7 +10618,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10568,7 +10634,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -10596,19 +10662,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
update_idle_cpu_scan(env, sum_util);
@@ -10698,7 +10758,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
- if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
@@ -10717,7 +10777,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* If there is no overload, we just want to even the number of
- * idle cpus.
+ * idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0,
@@ -10790,7 +10850,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
@@ -10813,7 +10873,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
/**
- * find_busiest_group - Returns the busiest group within the sched_domain
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
* @env: The load balancing environment.
*
@@ -10822,7 +10882,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*
* Return: - The busiest group if imbalance exists.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -10845,12 +10905,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_misfit_task)
goto force_balance;
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
+ goto out_balanced;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
@@ -10913,7 +10970,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE) {
+ if (!env->idle) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
@@ -10961,9 +11018,9 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
@@ -11121,7 +11178,7 @@ asym_active_balance(struct lb_env *env)
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -11159,7 +11216,7 @@ static int need_active_balance(struct lb_env *env)
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
+ if (env->idle &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -11244,7 +11301,7 @@ static int should_we_balance(struct lb_env *env)
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -11276,13 +11333,13 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
@@ -11300,7 +11357,7 @@ redo:
env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
@@ -11415,8 +11472,12 @@ more_balance:
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migrate_misfit, which is not related to
+ * load/util migration: don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -11495,12 +11556,17 @@ out_one_pinned:
ld_moved = 0;
/*
- * newidle_balance() disregards balance intervals, so we could
+ * sched_balance_newidle() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly, a misfit migration is not necessarily an indication that
+ * the system is busy and that lb needs to back off to let it settle
+ * down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
@@ -11633,10 +11699,23 @@ out_unlock:
return 0;
}
-static DEFINE_SPINLOCK(balancing);
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize sched_balance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
/*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
@@ -11674,7 +11753,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
*
* Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
@@ -11711,25 +11790,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
- if (!spin_trylock(&balancing))
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
- spin_unlock(&balancing);
+ atomic_set_release(&sched_balance_running, 0);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
@@ -11889,7 +11968,7 @@ static void nohz_balancer_kick(struct rq *rq)
* currently idle; in which case, kick the ILB to move tasks
* around.
*
- * When balancing betwen cores, all the SMT siblings of the
+ * When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
@@ -11906,7 +11985,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12050,7 +12129,7 @@ void nohz_balance_enter_idle(int cpu)
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
@@ -12068,13 +12147,13 @@ static bool update_nohz_stats(struct rq *rq)
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
- update_blocked_averages(cpu);
+ sched_balance_update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/*
- * Internal function that runs load balance for all idle cpus. The load balance
+ * Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
@@ -12150,7 +12229,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
+ sched_balance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
@@ -12179,7 +12258,7 @@ abort:
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -12210,7 +12289,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* called from this function on (this) CPU that's not yet in the mask. That's
* OK because the goal of nohz_run_idle_balance() is to run ILB only for
* updating the blocked load of already idle CPUs without waking up one of
- * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
* cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
@@ -12221,7 +12300,7 @@ void nohz_run_idle_balance(int cpu)
/*
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
- * (ie NOHZ_STATS_KICK set) and will do the same.
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
@@ -12266,7 +12345,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * newidle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -12274,10 +12353,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
@@ -12292,8 +12372,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
return 0;
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+ * for CPU_NEWLY_IDLE, such that we measure this duration
+ * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
@@ -12314,7 +12395,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!READ_ONCE(this_rq->rd->overload) ||
+ if (!get_rd_overloaded(this_rq->rd) ||
(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
if (sd)
@@ -12328,11 +12409,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
- update_blocked_averages(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
u64 domain_cost;
update_next_balance(sd, &next_balance);
@@ -12342,7 +12422,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
- pulled_task = load_balance(this_cpu, this_rq,
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -12358,8 +12438,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
- this_rq->ttwu_pending)
+ if (pulled_task || !continue_balancing)
break;
}
rcu_read_unlock();
@@ -12397,19 +12476,21 @@ out:
}
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local sched_tick() for periodic load balancing
+ *
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this CPU has a pending nohz_balance_kick, then do the
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
* balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
@@ -12418,14 +12499,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
return;
/* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
/*
* Don't need to rebalance while attached to NULL domain or
@@ -12609,7 +12690,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));
task_tick_core(rq, curr);
}
@@ -12629,6 +12710,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);
+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12752,6 +12835,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13123,7 +13208,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
@@ -13203,7 +13288,7 @@ __init void init_sched_fair_class(void)
#endif
}
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 373d42c707bc..5891e715f00d 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -46,7 +46,16 @@ int housekeeping_any_cpu(enum hk_type type)
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ if (likely(cpu < nr_cpu_ids))
+ return cpu;
+ /*
+ * Unless we have another problem this can only happen
+ * at boot time before start_secondary() brings the 1st
+ * housekeeping CPU up.
+ */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
+ type != HK_TYPE_TIMER);
}
}
return smp_processor_id();
@@ -109,6 +118,7 @@ static void __init housekeeping_setup_type(enum hk_type type,
static int __init housekeeping_setup(char *str, unsigned long flags)
{
cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ unsigned int first_cpu;
int err = 0;
if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
@@ -129,7 +139,8 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
cpumask_andnot(housekeeping_staging,
cpu_possible_mask, non_housekeeping_mask);
- if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+ first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
+ if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
if (!housekeeping.flags) {
@@ -138,6 +149,9 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
}
}
+ if (cpumask_empty(non_housekeeping_mask))
+ goto free_housekeeping_staging;
+
if (!housekeeping.flags) {
/* First setup call ("nohz_full=" or "isolcpus=") */
enum hk_type type;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 52c8f8226b0d..ca9da66cc894 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -379,7 +379,7 @@ void calc_global_load(void)
}
/*
- * Called from scheduler_tick() to periodically update this CPU's
+ * Called from sched_tick() to periodically update this CPU's
* active count.
*/
void calc_global_load_tick(struct rq *this_rq)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 4e715b9b278e..809194cd779f 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -254,7 +254,7 @@ static int membarrier_global_expedited(void)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
*/
smp_mb(); /* system call entry is not a mb. */
@@ -304,7 +304,7 @@ static int membarrier_global_expedited(void)
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
@@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
+ prepare_sync_core_cmd(mm);
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
smp_mb(); /* system call entry is not a mb. */
@@ -420,7 +425,7 @@ out:
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 63b6cf898220..ef00382de595 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
- * this happens during idle_balance() which calls
- * update_blocked_averages().
+ * this happens during sched_balance_newidle() which calls
+ * sched_balance_update_blocked_averages().
*
* Also see the comment in accumulate_sum().
*/
@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
/*
- * thermal:
+ * hardware:
*
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
*
* util_avg and runnable_load_avg are not supported and meaningless.
*
* Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
* time weighted "delta" capacity unlike util_avg which is binary.
* "delta capacity" = actual capacity -
- * capped capacity a cpu due to a thermal event.
+ * capped capacity of a cpu due to a HW event.
*/
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
- if (___update_load_sum(now, &rq->avg_thermal,
+ if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
capacity)) {
- ___update_load_avg(&rq->avg_thermal, 1);
- trace_pelt_thermal_tp(rq);
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
return 1;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465fbc..2150062949d4 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
- return READ_ONCE(rq->avg_thermal.load_avg);
+ return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2242679239e..a831af102070 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -79,6 +79,8 @@
# include <asm/paravirt_api_clock.h>
#endif
+#include <asm/barrier.h>
+
#include "cpupri.h"
#include "cpudeadline.h"
@@ -110,6 +112,20 @@ extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -699,7 +715,7 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- int overloaded;
+ bool overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
@@ -743,7 +759,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- int overloaded;
+ bool overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
@@ -836,10 +852,6 @@ struct perf_domain {
struct rcu_head rcu;
};
-/* Scheduling group status flags */
-#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
-#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
-
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -860,10 +872,10 @@ struct root_domain {
* - More than one runnable task
* - Running task is misfit
*/
- int overload;
+ bool overloaded;
/* Indicate one or more cpus over-utilized (tipping point) */
- int overutilized;
+ bool overutilized;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -903,8 +915,6 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
-
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
@@ -918,6 +928,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);
+static inline int get_rd_overloaded(struct root_domain *rd)
+{
+ return READ_ONCE(rd->overloaded);
+}
+
+static inline void set_rd_overloaded(struct root_domain *rd, int status)
+{
+ if (get_rd_overloaded(rd) != status)
+ WRITE_ONCE(rd->overloaded, status);
+}
+
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
@@ -1089,8 +1110,8 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
- struct sched_avg avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
#endif
u64 idle_stamp;
u64 avg_idle;
@@ -1531,24 +1552,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- * Decay shift Decay period(ms)
- * 0 32
- * 1 64
- * 2 128
- * 3 256
- * 4 512
- */
-extern int sched_thermal_decay_shift;
-
-static inline u64 rq_clock_thermal(struct rq *rq)
-{
- return rq_clock_task(rq) >> sched_thermal_decay_shift;
-}
-
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
@@ -2397,7 +2400,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void sched_balance_trigger(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
@@ -2517,10 +2520,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
#ifdef CONFIG_SMP
- if (prev_nr < 2 && rq->nr_running >= 2) {
- if (!READ_ONCE(rq->rd->overload))
- WRITE_ONCE(rq->rd->overload, 1);
- }
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
#endif
sched_update_tick_dependency(rq);
@@ -2904,7 +2905,7 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_NEWILB_KICK_BIT 2
#define NOHZ_NEXT_KICK_BIT 3
-/* Run rebalance_domains() */
+/* Run sched_balance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
@@ -3445,13 +3446,19 @@ static inline void switch_mm_cid(struct rq *rq,
* between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
* Provide it here.
*/
- if (!prev->mm) // from kernel
+ if (!prev->mm) { // from kernel
smp_mb();
- /*
- * user -> user transition guarantees a memory barrier through
- * switch_mm() when current->mm changes. If current->mm is
- * unchanged, no barrier is needed.
- */
+ } else { // from user
+ /*
+ * user->user transition relies on an implicit
+ * memory barrier in switch_mm() when
+ * current->mm changes. If the architecture
+ * switch_mm() does not have an implicit memory
+ * barrier, it is emitted here. If current->mm
+ * is unchanged, no barrier is needed.
+ */
+ smp_mb__after_switch_mm();
+ }
}
if (prev->mm_cid_active) {
mm_cid_snapshot_time(rq, prev->mm);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 857f837f52cb..78e48f5426ee 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "domain%d %*pb", dcount++,
cpumask_pr_args(sched_domain_span(sd)));
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
+ for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 99ea5986038c..63aecd2a7a9f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1330,23 +1330,12 @@ next:
}
/*
- * Asymmetric CPU capacity bits
- */
-struct asym_cap_data {
- struct list_head link;
- unsigned long capacity;
- unsigned long cpus[];
-};
-
-/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
* capacity.
* The lifespan of data is unlimited.
*/
-static LIST_HEAD(asym_cap_list);
-
-#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+LIST_HEAD(asym_cap_list);
/*
* Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,
}
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
static inline void asym_cpu_capacity_update_data(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- struct asym_cap_data *entry = NULL;
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+ /*
+ * Search if capacity already exists. If not, track the entry after
+ * which we should insert to keep the list ordered descendingly.
+ */
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
}
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
return;
entry->capacity = capacity;
- list_add(&entry->link, &asym_cap_list);
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
@@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
if (cpumask_empty(cpu_capacity_span(entry))) {
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
*/
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -2507,16 +2514,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- unsigned long capacity;
-
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
- capacity = arch_scale_cpu_capacity(i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
-
cpu_attach_domain(sd, d.rd, i);
if (lowest_flag_domain(i, SD_CLUSTER))
@@ -2530,10 +2530,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_cluster)
static_branch_inc_cpuslocked(&sched_cluster_active);
- if (rq && sched_debug_verbose) {
- pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
ret = 0;
error:
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index aca7b437882e..f70e031e06a8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2334,7 +2334,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
+static int read_actions_logged(const struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@@ -2352,7 +2352,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
-static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
+static int write_actions_logged(const struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
diff --git a/kernel/signal.c b/kernel/signal.c
index bdca529f0f7b..7bdbcf1b78d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2741,12 +2741,15 @@ relock:
/* Has this task already been marked for death? */
if ((signal->flags & SIGNAL_GROUP_EXIT) ||
signal->group_exec_task) {
- clear_siginfo(&ksig->info);
- ksig->info.si_signo = signr = SIGKILL;
+ signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
- &sighand->action[SIGKILL - 1]);
+ &sighand->action[SIGKILL-1]);
recalc_sigpending();
+ /*
+ * implies do_group_exit() or return to PF_USER_WORKER,
+ * no need to initialize ksig->info/etc.
+ */
goto fatal;
}
@@ -2856,7 +2859,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(ksig->info.si_signo))) {
+ if (likely(do_signal_stop(signr))) {
/* It released the siglock. */
goto relock;
}
@@ -2880,7 +2883,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(ksig->info.si_signo);
+ print_fatal_signal(signr);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2895,8 +2898,9 @@ relock:
/*
* PF_USER_WORKER threads will catch and exit on fatal signals
- * themselves. They have cleanup that must be performed, so
- * we cannot call do_exit() on their behalf.
+ * themselves. They have cleanup that must be performed, so we
+ * cannot call do_exit() on their behalf. Note that ksig won't
+ * be properly initialized, PF_USER_WORKER's shouldn't use it.
*/
if (current->flags & PF_USER_WORKER)
goto out;
@@ -2904,17 +2908,17 @@ relock:
/*
* Death signals, no core dump.
*/
- do_group_exit(ksig->info.si_signo);
+ do_group_exit(signr);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
-out:
+
ksig->sig = signr;
- if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
+ if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
hide_si_addr_tag_bits(ksig);
-
- return ksig->sig > 0;
+out:
+ return signr > 0;
}
/**
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b315b21fb28c..02582017759a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -508,7 +508,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage __visible void __softirq_entry __do_softirq(void)
+static void handle_softirqs(bool ksirqd)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
@@ -563,8 +563,7 @@ restart:
pending >>= softirq_bit;
}
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
- __this_cpu_read(ksoftirqd) == current)
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ksirqd)
rcu_softirq_qs();
local_irq_disable();
@@ -584,6 +583,11 @@ restart:
current_restore_flags(old_flags, PF_MEMALLOC);
}
+asmlinkage __visible void __softirq_entry __do_softirq(void)
+{
+ handle_softirqs(false);
+}
+
/**
* irq_enter_rcu - Enter an interrupt context with RCU watching
*/
@@ -921,7 +925,7 @@ static void run_ksoftirqd(unsigned int cpu)
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
- __do_softirq();
+ handle_softirqs(true);
ksoftirqd_run_end();
cond_resched();
return;
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 34c9d81eea94..59cdfaf5118e 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -27,10 +27,10 @@ static int stack_erasing_sysctl(struct ctl_table *table, int write,
int ret = 0;
int state = !static_branch_unlikely(&stack_erasing_bypass);
int prev_state = state;
+ struct ctl_table table_copy = *table;
- table->data = &state;
- table->maxlen = sizeof(int);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ table_copy.data = &state;
+ ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
state = !!state;
if (ret || !write || state == prev_state)
return ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index f8e543f1e38a..8bb106a56b3a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2408,8 +2408,11 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
- /* PARISC cannot allow mdwe as it needs writable stacks */
- if (IS_ENABLED(CONFIG_PARISC))
+ /*
+ * EOPNOTSUPP might be more appropriate here in principle, but
+ * existing userspace depends on EINVAL specifically.
+ */
+ if (!arch_memory_deny_write_exec_supported())
return -EINVAL;
current_bits = get_current_mdwe();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..81cc974913bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1710,9 +1710,9 @@ static struct ctl_table kern_table[] = {
{
.procname = "ftrace_dump_on_oops",
.data = &ftrace_dump_on_oops,
- .maxlen = sizeof(int),
+ .maxlen = MAX_TRACER_SIZE,
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dostring,
},
{
.procname = "traceoff_on_warning",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fc3b1a06c981..8ebb6d5a106b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -202,7 +202,7 @@ config HIGH_RES_TIMERS
the size of the kernel image.
config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
- int "Clocksource watchdog maximum allowable skew (in μs)"
+ int "Clocksource watchdog maximum allowable skew (in microseconds)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
default 125
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4657cb8e8b1f..5abfa4390673 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -134,7 +134,7 @@ static struct class_interface alarmtimer_rtc_interface = {
static int alarmtimer_rtc_interface_setup(void)
{
- alarmtimer_rtc_interface.class = rtc_class;
+ alarmtimer_rtc_interface.class = &rtc_class;
return class_interface_register(&alarmtimer_rtc_interface);
}
static void alarmtimer_rtc_interface_remove(void)
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a7ca458cdd9c..60a6484831b1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -677,7 +677,7 @@ static ssize_t current_device_show(struct device *dev,
raw_spin_lock_irq(&clockevents_lock);
td = tick_get_tick_dev(dev);
if (td && td->evtdev)
- count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
+ count = sysfs_emit(buf, "%s\n", td->evtdev->name);
raw_spin_unlock_irq(&clockevents_lock);
return count;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e5b260aa0e02..d25ba49e313c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -20,6 +20,16 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
+{
+ u64 delta = clocksource_delta(end, start, cs->mask);
+
+ if (likely(delta < cs->max_cycles))
+ return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+ return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
@@ -222,8 +232,8 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
unsigned int nretries, max_retries;
- u64 wd_end, wd_end2, wd_delta;
int64_t wd_delay, wd_seq_delay;
+ u64 wd_end, wd_end2;
max_retries = clocksource_get_max_watchdog_retry();
for (nretries = 0; nretries <= max_retries; nretries++) {
@@ -234,9 +244,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
wd_end2 = watchdog->read(watchdog);
local_irq_enable();
- wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
- wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
- watchdog->shift);
+ wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
if (wd_delay <= WATCHDOG_MAX_SKEW) {
if (nretries > 1 || nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
@@ -254,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
* report system busy, reinit the watchdog and skip the current
* watchdog test.
*/
- wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
- wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
goto skip_test;
}
@@ -366,8 +373,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
delta = (csnow_end - csnow_mid) & cs->mask;
if (delta < 0)
cpumask_set_cpu(cpu, &cpus_ahead);
- delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
if (cs_nsec > cs_nsec_max)
cs_nsec_max = cs_nsec;
if (cs_nsec < cs_nsec_min)
@@ -398,8 +404,8 @@ static inline void clocksource_reset_watchdog(void)
static void clocksource_watchdog(struct timer_list *unused)
{
- u64 csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec, interval;
+ u64 csnow, wdnow, cslast, wdlast;
int next_cpu, reset_pending;
struct clocksource *cs;
enum wd_read_status read_ret;
@@ -456,12 +462,8 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
- wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
- watchdog->shift);
-
- delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
- cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+ cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
@@ -832,7 +834,7 @@ void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
*/
u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
{
- u64 now, delta, nsec = 0;
+ u64 now, nsec = 0;
if (!suspend_clocksource)
return 0;
@@ -847,12 +849,8 @@ u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
else
now = suspend_clocksource->read(suspend_clocksource);
- if (now > suspend_start) {
- delta = clocksource_delta(now, suspend_start,
- suspend_clocksource->mask);
- nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
- suspend_clocksource->shift);
- }
+ if (now > suspend_start)
+ nsec = cycles_to_nsec_safe(suspend_clocksource, suspend_start, now);
/*
* Disable the suspend timer to save power if current clocksource is
@@ -1336,7 +1334,7 @@ static ssize_t current_clocksource_show(struct device *dev,
ssize_t count = 0;
mutex_lock(&clocksource_mutex);
- count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ count = sysfs_emit(buf, "%s\n", curr_clocksource->name);
mutex_unlock(&clocksource_mutex);
return count;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 70625dff62ce..492c14aac642 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -644,17 +644,12 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* Is the high resolution mode active ?
*/
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
cpu_base->hres_active : 0;
}
-static inline int hrtimer_hres_active(void)
-{
- return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
-}
-
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *next_timer,
ktime_t expires_next)
@@ -678,7 +673,7 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
* set. So we'd effectively block all timers until the T2 event
* fires.
*/
- if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(expires_next, 1);
@@ -789,12 +784,12 @@ static void retrigger_next_event(void *arg)
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
*/
- if (!__hrtimer_hres_active(base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(base) && !tick_nohz_active)
return;
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
- if (__hrtimer_hres_active(base))
+ if (hrtimer_hres_active(base))
hrtimer_force_reprogram(base, 0);
else
hrtimer_update_next_event(base);
@@ -951,7 +946,7 @@ void clock_was_set(unsigned int bases)
cpumask_var_t mask;
int cpu;
- if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1491,7 +1486,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!__hrtimer_hres_active(cpu_base))
+ if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1514,7 +1509,7 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (__hrtimer_hres_active(cpu_base)) {
+ if (hrtimer_hres_active(cpu_base)) {
unsigned int active;
if (!cpu_base->softirq_activated) {
@@ -1875,25 +1870,7 @@ retry:
tick_program_event(expires_next, 1);
pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
-
-/* called with interrupts disabled */
-static inline void __hrtimer_peek_ahead_timers(void)
-{
- struct tick_device *td;
-
- if (!hrtimer_hres_active())
- return;
-
- td = this_cpu_ptr(&tick_cpu_device);
- if (td && td->evtdev)
- hrtimer_interrupt(td->evtdev);
-}
-
-#else /* CONFIG_HIGH_RES_TIMERS */
-
-static inline void __hrtimer_peek_ahead_timers(void) { }
-
-#endif /* !CONFIG_HIGH_RES_TIMERS */
+#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
* Called from run_local_timers in hardirq context every jiffy
@@ -1904,7 +1881,7 @@ void hrtimer_run_queues(void)
unsigned long flags;
ktime_t now;
- if (__hrtimer_hres_active(cpu_base))
+ if (hrtimer_hres_active(cpu_base))
return;
/*
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9de66bbbb3d1..4782edcbe7b9 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -129,15 +129,17 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
goto out;
}
pccontext->clk = clk;
- fp->private_data = pccontext;
- if (clk->ops.open)
+ if (clk->ops.open) {
err = clk->ops.open(pccontext, fp->f_mode);
- else
- err = 0;
-
- if (!err) {
- get_device(clk->dev);
+ if (err) {
+ kfree(pccontext);
+ goto out;
+ }
}
+
+ fp->private_data = pccontext;
+ get_device(clk->dev);
+ err = 0;
out:
up_read(&clk->rwsem);
return err;
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index fb0fdec8719a..d88b13076b79 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -7,6 +7,7 @@
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
+#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
@@ -84,7 +85,7 @@ int tick_is_oneshot_available(void)
*/
static void tick_periodic(int cpu)
{
- if (tick_do_timer_cpu == cpu) {
+ if (READ_ONCE(tick_do_timer_cpu) == cpu) {
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
@@ -215,8 +216,8 @@ static void tick_setup_device(struct tick_device *td,
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
- if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- tick_do_timer_cpu = cpu;
+ if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
@@ -232,7 +233,7 @@ static void tick_setup_device(struct tick_device *td,
!tick_nohz_full_cpu(cpu)) {
tick_take_do_timer_from_boot();
tick_do_timer_boot_cpu = -1;
- WARN_ON(tick_do_timer_cpu != cpu);
+ WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
#endif
}
@@ -406,10 +407,10 @@ void tick_assert_timekeeping_handover(void)
int tick_cpu_dying(unsigned int dying_cpu)
{
/*
- * If the current CPU is the timekeeper, it's the only one that
- * can safely hand over its duty. Also all online CPUs are in
- * stop machine, guaranteed not to be idle, therefore it's safe
- * to pick any online successor.
+ * If the current CPU is the timekeeper, it's the only one that can
+ * safely hand over its duty. Also all online CPUs are in stop
+ * machine, guaranteed not to be idle, therefore there is no
+ * concurrency and it's safe to pick any online successor.
*/
if (tick_do_timer_cpu == dying_cpu)
tick_do_timer_cpu = cpumask_first(cpu_online_mask);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 269e21590df5..71a792cd8936 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -8,6 +8,7 @@
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
+#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
@@ -204,7 +205,7 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
- int cpu = smp_processor_id();
+ int tick_cpu, cpu = smp_processor_id();
/*
* Check if the do_timer duty was dropped. We don't care about
@@ -216,16 +217,18 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
* If nohz_full is enabled, this should not happen because the
* 'tick_do_timer_cpu' CPU never relinquishes.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
- unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
WARN_ON_ONCE(tick_nohz_full_running);
#endif
- tick_do_timer_cpu = cpu;
+ WRITE_ONCE(tick_do_timer_cpu, cpu);
+ tick_cpu = cpu;
}
/* Check if jiffies need an update */
- if (tick_do_timer_cpu == cpu)
+ if (tick_cpu == cpu)
tick_do_update_jiffies64(now);
/*
@@ -610,7 +613,7 @@ bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
* timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
- if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
+ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
return false;
return true;
}
@@ -697,6 +700,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ * @now: current ktime_t
*
* Called from interrupt entry when the CPU was idle
*
@@ -794,7 +798,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
@@ -820,7 +824,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
@@ -890,6 +894,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
u64 basemono, next_tick, delta, expires;
unsigned long basejiff;
+ int tick_cpu;
basemono = get_jiffies_update(&basejiff);
ts->last_jiffies = basejiff;
@@ -946,9 +951,9 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
- if (cpu != tick_do_timer_cpu &&
- (tick_do_timer_cpu != TICK_DO_TIMER_NONE ||
- !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+ if (tick_cpu != cpu &&
+ (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
delta = KTIME_MAX;
/* Calculate the next expiry time */
@@ -969,6 +974,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
unsigned long basejiff = ts->last_jiffies;
u64 basemono = ts->timer_expires_base;
bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
+ int tick_cpu;
u64 expires;
/* Make sure we won't be trying to stop it twice in a row. */
@@ -1006,10 +1012,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
* do_timer() never gets invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
- if (cpu == tick_do_timer_cpu) {
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ tick_cpu = READ_ONCE(tick_do_timer_cpu);
+ if (tick_cpu == cpu) {
+ WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
- } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ } else if (tick_cpu != TICK_DO_TIMER_NONE) {
tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
}
@@ -1172,15 +1179,17 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
if (tick_nohz_full_enabled()) {
+ int tick_cpu = READ_ONCE(tick_do_timer_cpu);
+
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around
*/
- if (tick_do_timer_cpu == cpu)
+ if (tick_cpu == cpu)
return false;
/* Should not happen for nohz-full */
- if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
return false;
}
@@ -1287,6 +1296,8 @@ void tick_nohz_irq_exit(void)
/**
* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ *
+ * Return: %true if the tick handler has run, otherwise %false
*/
bool tick_nohz_idle_got_tick(void)
{
@@ -1305,6 +1316,8 @@ bool tick_nohz_idle_got_tick(void)
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
+ *
+ * Return: the next expiration time
*/
ktime_t tick_nohz_get_next_hrtimer(void)
{
@@ -1320,6 +1333,8 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* The return value of this function and/or the value returned by it through the
* @delta_next pointer can be negative which must be taken into account by its
* callers.
+ *
+ * Return: the expected length of the current sleep
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
@@ -1357,8 +1372,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
/**
* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
* for a particular CPU.
+ * @cpu: target CPU number
*
* Called from the schedutil frequency scaling governor in scheduler context.
+ *
+ * Return: the current idle calls counter value for @cpu
*/
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
@@ -1371,6 +1389,8 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
+ *
+ * Return: the current idle calls counter value for the current CPU
*/
unsigned long tick_nohz_get_idle_calls(void)
{
@@ -1559,7 +1579,7 @@ early_param("skew_tick", skew_tick);
/**
* tick_setup_sched_timer - setup the tick emulation timer
- * @mode: tick_nohz_mode to setup for
+ * @hrtimer: whether to use the hrtimer or not
*/
void tick_setup_sched_timer(bool hrtimer)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index e11c4dc65bcb..b4a7822f495d 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -46,8 +46,8 @@ struct tick_device {
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_waketime: Time when the idle was interrupted
+ * @idle_sleeptime_seq: sequence counter for data consistency
* @idle_entrytime: Time when the idle call was entered
- * @nohz_mode: Mode - one state of tick_nohz_mode
* @last_jiffies: Base jiffies snapshot when next event was last computed
* @timer_expires_base: Base time clock monotonic for @timer_expires
* @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b58dffc58a8f..4e18db1819f8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -237,7 +237,9 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
}
}
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);
+
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
struct timekeeper *tk = &tk_core.timekeeper;
u64 now, last, mask, max, delta;
@@ -264,34 +266,23 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
* Try to catch underflows by checking if we are seeing small
* mask-relative negative values.
*/
- if (unlikely((~delta & mask) < (mask >> 3))) {
+ if (unlikely((~delta & mask) < (mask >> 3)))
tk->underflow_seen = 1;
- delta = 0;
- }
- /* Cap delta value to the max_cycles values to avoid mult overflows */
- if (unlikely(delta > max)) {
+ /* Check for multiplication overflows */
+ if (unlikely(delta > max))
tk->overflow_seen = 1;
- delta = tkr->clock->max_cycles;
- }
- return delta;
+ /* timekeeping_cycles_to_ns() handles both under and overflow */
+ return timekeeping_cycles_to_ns(tkr, now);
}
#else
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
- u64 cycle_now, delta;
-
- /* read clocksource */
- cycle_now = tk_clock_read(tkr);
-
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
-
- return delta;
+ BUG();
}
#endif
@@ -370,32 +361,46 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
}
/* Timekeeper helper functions. */
+static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
+{
+ return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
+}
-static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
- u64 nsec;
+ /* Calculate the delta since the last update_wall_time() */
+ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
- nsec = delta * tkr->mult + tkr->xtime_nsec;
- nsec >>= tkr->shift;
+ /*
+ * This detects both negative motion and the case where the delta
+ * overflows the multiplication with tkr->mult.
+ */
+ if (unlikely(delta > tkr->clock->max_cycles)) {
+ /*
+ * Handle clocksource inconsistency between CPUs to prevent
+ * time from going backwards by checking for the MSB of the
+ * mask being set in the delta.
+ */
+ if (delta & ~(mask >> 1))
+ return tkr->xtime_nsec >> tkr->shift;
+
+ return delta_to_ns_safe(tkr, delta);
+ }
- return nsec;
+ return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
-static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
+static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
- u64 delta;
-
- delta = timekeeping_get_delta(tkr);
- return timekeeping_delta_to_ns(tkr, delta);
+ return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
- u64 delta;
+ if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
+ return timekeeping_debug_get_ns(tkr);
- /* calculate the delta since the last update_wall_time */
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
+ return __timekeeping_get_ns(tkr);
}
/**
@@ -431,14 +436,6 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
}
-static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
-{
- u64 delta, cycles = tk_clock_read(tkr);
-
- delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
- return timekeeping_delta_to_ns(tkr, delta);
-}
-
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
@@ -449,7 +446,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += fast_tk_get_delta_ns(tkr);
+ now += __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
return now;
@@ -565,7 +562,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
- delta = fast_tk_get_delta_ns(tkr);
+ delta = __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
@@ -800,10 +797,15 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
- tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+ while (delta > 0) {
+ u64 max = tk->tkr_mono.clock->max_cycles;
+ u64 incr = delta < max ? delta : max;
- tk_normalize_xtime(tk);
+ tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
+ tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
+ tk_normalize_xtime(tk);
+ delta -= incr;
+ }
}
/**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e69e75d3858c..e394d6d5b9b5 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -64,15 +64,15 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The timer wheel has LVL_DEPTH array levels. Each level provides an array of
- * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
* level has a different granularity.
*
- * The level granularity is: LVL_CLK_DIV ^ lvl
+ * The level granularity is: LVL_CLK_DIV ^ level
* The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
*
* The array level of a newly armed timer depends on the relative expiry
* time. The farther the expiry time is away the higher the array level and
- * therefor the granularity becomes.
+ * therefore the granularity becomes.
*
* Contrary to the original timer wheel implementation, which aims for 'exact'
* expiry of the timers, this implementation removes the need for recascading
@@ -207,7 +207,7 @@ EXPORT_SYMBOL(jiffies_64);
* struct timer_base - Per CPU timer base (number of base depends on config)
* @lock: Lock protecting the timer_base
* @running_timer: When expiring timers, the lock is dropped. To make
- * sure not to race agains deleting/modifying a
+ * sure not to race against deleting/modifying a
* currently running timer, the pointer is set to the
* timer, which expires at the moment. If no timer is
* running, the pointer is NULL.
@@ -642,7 +642,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* the base lock:
*/
if (base->is_idle) {
- WARN_ON_ONCE(!(timer->flags & TIMER_PINNED));
+ WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
+ tick_nohz_full_cpu(base->cpu)));
wake_up_nohz_cpu(base->cpu);
}
}
@@ -736,7 +737,7 @@ static bool timer_is_static_object(void *addr)
}
/*
- * fixup_init is called when:
+ * timer_fixup_init is called when:
* - an active object is initialized
*/
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
@@ -760,7 +761,7 @@ static void stub_timer(struct timer_list *unused)
}
/*
- * fixup_activate is called when:
+ * timer_fixup_activate is called when:
* - an active object is activated
* - an unknown non-static object is activated
*/
@@ -782,7 +783,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
}
/*
- * fixup_free is called when:
+ * timer_fixup_free is called when:
* - an active object is freed
*/
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
@@ -800,7 +801,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state)
}
/*
- * fixup_assert_init is called when:
+ * timer_fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
@@ -913,7 +914,7 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
void init_timer_key(struct timer_list *timer,
@@ -1416,7 +1417,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown)
* If @shutdown is set then the lock has to be taken whether the
* timer is pending or not to protect against a concurrent rearm
* which might hit between the lockless pending check and the lock
- * aquisition. By taking the lock it is ensured that such a newly
+ * acquisition. By taking the lock it is ensured that such a newly
* enqueued timer is dequeued and cannot end up with
* timer->function == NULL in the expiry code.
*
@@ -2292,13 +2293,20 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
*/
if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
base_local->is_idle = true;
+ /*
+ * Global timers queued locally while running in a task
+ * in nohz_full mode need a self-IPI to kick reprogramming
+ * in IRQ tail.
+ */
+ if (tick_nohz_full_cpu(base_local->cpu))
+ base_global->is_idle = true;
trace_timer_base_idle(true, base_local->cpu);
}
*idle = base_local->is_idle;
/*
* When timer base is not set idle, undo the effect of
- * tmigr_cpu_deactivate() to prevent inconsitent states - active
+ * tmigr_cpu_deactivate() to prevent inconsistent states - active
* timer base but inactive timer migration hierarchy.
*
* When timer base was already marked idle, nothing will be
@@ -2364,6 +2372,8 @@ void timer_clear_idle(void)
* path. Required for BASE_LOCAL only.
*/
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
trace_timer_base_idle(false, smp_processor_id());
/* Activate without holding the timer_base->lock */
@@ -2478,7 +2488,7 @@ void update_process_times(int user_tick)
if (in_irq())
irq_work_tick();
#endif
- scheduler_tick();
+ sched_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 8f49b6b96dfd..84413114db5c 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -765,10 +765,17 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* is never set.
* - tmigr_inactive_up() takes care of the propagation by
* itself and ignores the return value. But an immediate
- * return is required because nothing has to be done in this
- * level as the event could be ignored.
+ * return is possible if there is a parent, sparing group
+ * locking at this level, because the upper walking call to
+ * the parent will take care about removing this event from
+ * within the group and update next_expiry accordingly.
+ *
+ * However if there is no parent, ie: the hierarchy has only a
+ * single level so @group is the top level group, make sure the
+ * first event information of the group is updated properly and
+ * also handled properly, so skip this fast return path.
*/
- if (evt->ignore && !remote)
+ if (evt->ignore && !remote && group->parent)
return true;
raw_spin_lock(&group->lock);
@@ -782,8 +789,11 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* queue when the expiry time changed only or when it could be ignored.
*/
if (timerqueue_node_queued(&evt->nextevt)) {
- if ((evt->nextevt.expires == nextexp) && !evt->ignore)
+ if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+ /* Make sure not to miss a new CPU event with the same expiry */
+ evt->cpu = first_childevt->cpu;
goto check_toplvl;
+ }
if (!timerqueue_del(&group->events, &evt->nextevt))
WRITE_ONCE(group->next_expiry, KTIME_MAX);
@@ -1058,8 +1068,15 @@ void tmigr_handle_remote(void)
* in tmigr_handle_remote_up() anyway. Keep this check to speed up the
* return when nothing has to be done.
*/
- if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask))
- return;
+ if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) {
+ /*
+ * If this CPU was an idle migrator, make sure to clear its wakeup
+ * value so it won't chase timers that have already expired elsewhere.
+ * This avoids endless requeue from tmigr_new_timer().
+ */
+ if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
+ return;
+ }
data.now = get_jiffies_update(&data.basej);
@@ -1579,7 +1596,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
} while (i < tmigr_hierarchy_levels);
- do {
+ while (i > 0) {
group = stack[--i];
if (err < 0) {
@@ -1628,7 +1645,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
tmigr_connect_child_parent(child, group);
}
}
- } while (i > 0);
+ }
kfree(stack);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index f0d5062d9cbc..9193d6133e5d 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -22,10 +22,16 @@ static inline void update_vdso_data(struct vdso_data *vdata,
u64 nsec, sec;
vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
+#endif
vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
+#endif
vdata[CS_RAW].mask = tk->tkr_raw.mask;
vdata[CS_RAW].mult = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61c541c36596..b3d7f62ac581 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -163,7 +163,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
config GENERIC_TRACER
bool
@@ -204,7 +204,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
@@ -965,7 +965,7 @@ config FTRACE_RECORD_RECURSION
config FTRACE_RECORD_RECURSION_SIZE
int "Max number of recursed functions to record"
- default 128
+ default 128
depends on FTRACE_RECORD_RECURSION
help
This defines the limit of number of functions that can be
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0a5c4efc73c3..f5154c051d2c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip)
{
u32 instr;
- /* Being extra safe in here in case entry ip is on the page-edge. */
- if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
- return fentry_ip;
+ /* We want to be extra safe in case entry ip is on the page edge,
+ * but otherwise we need to avoid get_kernel_nofault()'s overhead.
+ */
+ if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
+ if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
+ return fentry_ip;
+ } else {
+ instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
+ }
if (is_endbr(instr))
fentry_ip -= ENDBR_INSN_SIZE;
return fentry_ip;
@@ -1182,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
@@ -1197,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return -ENOENT;
return entry_cnt * br_entry_size;
-#endif
}
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
@@ -1525,8 +1527,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
@@ -1582,8 +1582,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_send_signal_thread_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_get_ns_current_pid_tgid:
- return &bpf_get_ns_current_pid_tgid_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1633,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+static bool is_kprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_kprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1648,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
return &bpf_get_attach_cookie_proto_umulti;
@@ -2008,6 +2017,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_raw_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_tracing;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2070,6 +2081,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
case BPF_FUNC_get_attach_cookie:
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_attach_cookie_proto_tracing;
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
@@ -2370,16 +2384,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
}
static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct bpf_prog *prog = link->link.prog;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_trace_run_ctx run_ctx;
+
cant_sleep();
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
+
+ run_ctx.bpf_cookie = link->cookie;
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
out:
this_cpu_dec(*(prog->active));
}
@@ -2408,12 +2432,12 @@ out:
#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x) \
- void bpf_trace_run##x(struct bpf_prog *prog, \
+ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
{ \
u64 args[x]; \
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
- __bpf_trace_run(prog, args); \
+ __bpf_trace_run(link, args); \
} \
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
@@ -2429,9 +2453,10 @@ BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
-static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
struct tracepoint *tp = btp->tp;
+ struct bpf_prog *prog = link->link.prog;
/*
* check that program doesn't access arguments beyond what's
@@ -2443,18 +2468,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
- prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
}
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
- return __bpf_probe_register(btp, prog);
-}
-
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
-{
- return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -2577,6 +2596,12 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+struct bpf_session_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ bool is_return;
+ void *data;
+};
+
#ifdef CONFIG_FPROBE
struct bpf_kprobe_multi_link {
struct bpf_link link;
@@ -2590,7 +2615,7 @@ struct bpf_kprobe_multi_link {
};
struct bpf_kprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
struct bpf_kprobe_multi_link *link;
unsigned long entry_ip;
};
@@ -2728,7 +2753,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
- .dealloc = bpf_kprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -2769,7 +2794,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
if (WARN_ON_ONCE(!ctx))
return 0;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
link = run_ctx->link;
if (!link->cookies)
return 0;
@@ -2786,15 +2812,21 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_kprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs)
+ unsigned long entry_ip, struct pt_regs *regs,
+ bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.link = link,
.entry_ip = entry_ip,
};
@@ -2809,7 +2841,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
migrate_disable();
rcu_read_lock();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
@@ -2826,10 +2858,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
void *data)
{
struct bpf_kprobe_multi_link *link;
+ int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
- return 0;
+ err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data);
+ return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
@@ -2840,7 +2873,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2973,7 +3006,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ if (!is_kprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.kprobe_multi.flags;
@@ -3054,10 +3087,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error;
- if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_exit_handler;
- else
+ if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
link->fp.entry_handler = kprobe_multi_link_handler;
+ if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
+ if (is_kprobe_session(prog))
+ link->fp.entry_data_size = sizeof(u64);
link->addrs = addrs;
link->cookies = cookies;
@@ -3157,6 +3192,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
}
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
@@ -3164,9 +3202,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- if (umulti_link->task)
- put_task_struct(umulti_link->task);
- path_put(&umulti_link->path);
kvfree(umulti_link->uprobes);
kfree(umulti_link);
}
@@ -3242,7 +3277,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
- .dealloc = bpf_uprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
@@ -3483,3 +3518,54 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif /* CONFIG_UPROBES */
+
+#ifdef CONFIG_FPROBE
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc bool bpf_session_is_return(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->is_return;
+}
+
+__bpf_kfunc __u64 *bpf_session_cookie(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->data;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_session_is_return)
+BTF_ID_FLAGS(func, bpf_session_cookie)
+BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+
+static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ if (!is_kprobe_session(prog))
+ return -EACCES;
+
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &kprobe_multi_kfunc_set_ids,
+ .filter = bpf_kprobe_multi_filter,
+};
+
+static int __init bpf_kprobe_multi_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+}
+
+late_initcall(bpf_kprobe_multi_kfuncs_init);
+#endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83ba342aef31..6c96b30f3d63 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1160,7 +1160,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
* Search a given @hash to see if a given instruction pointer (@ip)
* exists in it.
*
- * Returns the entry that holds the @ip if found. NULL otherwise.
+ * Returns: the entry that holds the @ip if found. NULL otherwise.
*/
struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1282,7 +1282,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
/**
* ftrace_free_filter - remove all filters for an ftrace_ops
- * @ops - the ops to remove the filters from
+ * @ops: the ops to remove the filters from
*/
void ftrace_free_filter(struct ftrace_ops *ops)
{
@@ -1587,7 +1587,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
* @end: end of range to search (inclusive). @end points to the last byte
* to check.
*
- * Returns rec->ip if the related ftrace location is a least partly within
+ * Returns: rec->ip if the related ftrace location is at least partly within
* the given address range. That is, the first address of the instruction
* that is either a NOP or call to the function tracer. It checks the ftrace
* internal tables to determine if the address belongs or not.
@@ -1607,9 +1607,10 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
* ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * If @ip matches the ftrace location, return @ip.
- * If @ip matches sym+0, return sym's ftrace location.
- * Otherwise, return 0.
+ * Returns:
+ * * If @ip matches the ftrace location, return @ip.
+ * * If @ip matches sym+0, return sym's ftrace location.
+ * * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
@@ -1639,7 +1640,7 @@ out:
* @start: start of range to search
* @end: end of range to search (inclusive). @end points to the last byte to check.
*
- * Returns 1 if @start and @end contains a ftrace location.
+ * Returns: 1 if @start and @end contains a ftrace location.
* That is, the instruction that is either a NOP or call to
* the function tracer. It checks the ftrace internal tables to
* determine if the address belongs or not.
@@ -2574,7 +2575,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
* is not set, then it wants to convert to the normal callback.
*
- * Returns the address of the trampoline to set to
+ * Returns: the address of the trampoline to set to
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
@@ -2615,7 +2616,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
* a function that saves all the regs. Basically the '_EN' version
* represents the current state of the function.
*
- * Returns the address of the trampoline that is currently being called
+ * Returns: the address of the trampoline that is currently being called
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
@@ -2719,7 +2720,7 @@ struct ftrace_rec_iter {
/**
* ftrace_rec_iter_start - start up iterating over traced functions
*
- * Returns an iterator handle that is used to iterate over all
+ * Returns: an iterator handle that is used to iterate over all
* the records that represent address locations where functions
* are traced.
*
@@ -2751,7 +2752,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
* ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
- * Returns the next iterator after the given iterator @iter.
+ * Returns: the next iterator after the given iterator @iter.
*/
struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
{
@@ -2776,7 +2777,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
* ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
- * Returns the record that the current @iter is at.
+ * Returns: the record that the current @iter is at.
*/
struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
{
@@ -3156,8 +3157,7 @@ out:
* synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- synchronize_rcu_tasks();
+ synchronize_rcu_tasks();
ftrace_trampoline_free(ops);
}
@@ -4010,6 +4010,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file)
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
* tracing_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
+ *
+ * Returns: 0 on success or a negative errno value on failure
*/
int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
@@ -4626,7 +4628,7 @@ struct ftrace_func_mapper {
/**
* allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
*
- * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
+ * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data.
*/
struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
{
@@ -4646,7 +4648,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
* @mapper: The mapper that has the ip maps
* @ip: the instruction pointer to find the data for
*
- * Returns the data mapped to @ip if found otherwise NULL. The return
+ * Returns: the data mapped to @ip if found otherwise NULL. The return
* is actually the address of the mapper data pointer. The address is
* returned for use cases where the data is no bigger than a long, and
* the user can use the data pointer as its data instead of having to
@@ -4672,7 +4674,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to map @data to
* @data: The data to map to @ip
*
- * Returns 0 on success otherwise an error.
+ * Returns: 0 on success otherwise an error.
*/
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
unsigned long ip, void *data)
@@ -4701,7 +4703,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @mapper: The mapper that has the ip maps
* @ip: The instruction pointer address to remove the data from
*
- * Returns the data if it is found, otherwise NULL.
+ * Returns: the data if it is found, otherwise NULL.
* Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
@@ -5625,10 +5627,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
/**
* ftrace_set_filter_ip - set a function to filter on in ftrace by address
- * @ops - the ops to set the filter with
- * @ip - the address to add to or remove from the filter.
- * @remove - non zero to remove the ip from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ip: the address to add to or remove from the filter.
+ * @remove: non zero to remove the ip from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
@@ -5647,11 +5649,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
* ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
- * @ops - the ops to set the filter with
- * @ips - the array of addresses to add to or remove from the filter.
- * @cnt - the number of addresses in @ips
- * @remove - non zero to remove ips from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ips: the array of addresses to add to or remove from the filter.
+ * @cnt: the number of addresses in @ips
+ * @remove: non zero to remove ips from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
@@ -5670,7 +5672,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
/**
* ftrace_ops_set_global_filter - setup ops to use global filters
- * @ops - the ops which will use the global filters
+ * @ops: the ops which will use the global filters
*
* ftrace users who need global function trace filtering should call this.
* It can set the global filter only if ops were not initialized before.
@@ -5694,10 +5696,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
/**
* ftrace_set_filter - set a function to filter on in ftrace
- * @ops - the ops to set the filter with
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5716,10 +5718,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
/**
* ftrace_set_notrace - set a function to not trace in ftrace
- * @ops - the ops to set the notrace filter with
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the notrace filter with
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -5738,9 +5740,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
* ftrace_set_global_filter - set a function to filter on with global tracers
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5753,9 +5755,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
* ftrace_set_global_notrace - set a function to not trace with global tracers
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -7443,7 +7445,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
* have its own recursion protection, then it should call the
* ftrace_ops_assist_func() instead.
*
- * Returns the function that the trampoline should call for @ops.
+ * Returns: the function that the trampoline should call for @ops.
*/
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
@@ -7897,7 +7899,7 @@ void ftrace_kill(void)
/**
* ftrace_is_dead - Test if ftrace is dead or not.
*
- * Returns 1 if ftrace is "dead", zero otherwise.
+ * Returns: 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
@@ -8142,8 +8144,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
- * This function returns 0 if all provided symbols are found,
- * -ESRCH otherwise.
+ * Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
{
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3103a484182e..6511dc3a00da 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -384,6 +384,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ atomic_t seq;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -753,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+ /* For waiters waiting for the first wake up */
+ (void)atomic_fetch_inc_release(&rbwork->seq);
+
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
/* Only cpu_buffer sets the above flags */
@@ -834,51 +838,24 @@ static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
ret = !pagebusy && full_hit(buffer, cpu, full);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
+ if (!ret && (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)) {
+ cpu_buffer->shortest_full = full;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
return ret;
}
-/**
- * ring_buffer_wait - wait for input to the ring buffer
- * @buffer: buffer to wait on
- * @cpu: the cpu buffer to wait on
- * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
- *
- * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
- * as data is added to any of the @buffer's cpu buffers. Otherwise
- * it will wait for data to be added to a specific cpu buffer.
- */
-int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
+static inline bool
+rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer,
+ int cpu, int full, ring_buffer_cond_fn cond, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer;
- DEFINE_WAIT(wait);
- struct rb_irq_work *work;
- int ret = 0;
-
- /*
- * Depending on what the caller is waiting for, either any
- * data in any cpu buffer, or a specific buffer, put the
- * caller on the appropriate wait queue.
- */
- if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
- /* Full only makes sense on per cpu reads */
- full = 0;
- } else {
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -ENODEV;
- cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
- }
+ if (rb_watermark_hit(buffer, cpu, full))
+ return true;
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ if (cond(data))
+ return true;
/*
* The events can happen in critical sections where
@@ -901,27 +878,82 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* a task has been queued. It's OK for spurious wake ups.
*/
if (full)
- work->full_waiters_pending = true;
+ rbwork->full_waiters_pending = true;
else
- work->waiters_pending = true;
+ rbwork->waiters_pending = true;
- if (rb_watermark_hit(buffer, cpu, full))
- goto out;
+ return false;
+}
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
+struct rb_wait_data {
+ struct rb_irq_work *irq_work;
+ int seq;
+};
+
+/*
+ * The default wait condition for ring_buffer_wait() is to just exit the
+ * wait loop the first time it is woken up.
+ */
+static bool rb_wait_once(void *data)
+{
+ struct rb_wait_data *rdata = data;
+ struct rb_irq_work *rbwork = rdata->irq_work;
+
+ return atomic_read_acquire(&rbwork->seq) != rdata->seq;
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @cond: condition function to break out of wait (NULL to run once)
+ * @data: the data to pass to @cond.
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
+ ring_buffer_cond_fn cond, void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct wait_queue_head *waitq;
+ struct rb_irq_work *rbwork;
+ struct rb_wait_data rdata;
+ int ret = 0;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ rbwork = &buffer->irq_work;
+ /* Full only makes sense on per cpu reads */
+ full = 0;
+ } else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
+ cpu_buffer = buffer->buffers[cpu];
+ rbwork = &cpu_buffer->irq_work;
}
- schedule();
- out:
if (full)
- finish_wait(&work->full_waiters, &wait);
+ waitq = &rbwork->full_waiters;
else
- finish_wait(&work->waiters, &wait);
+ waitq = &rbwork->waiters;
+
+ /* Set up to exit loop as soon as it is woken */
+ if (!cond) {
+ cond = rb_wait_once;
+ rdata.irq_work = rbwork;
+ rdata.seq = atomic_read_acquire(&rbwork->seq);
+ data = &rdata;
+ }
- if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
- ret = -EINTR;
+ ret = wait_event_interruptible((*waitq),
+ rb_wait_cond(rbwork, buffer, cpu, full, cond, data));
return ret;
}
@@ -959,21 +991,30 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
}
if (full) {
- unsigned long flags;
-
poll_wait(filp, &rbwork->full_waiters, poll_table);
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ if (rb_watermark_hit(buffer, cpu, full))
+ return EPOLLIN | EPOLLRDNORM;
+ /*
+ * Only allow full_waiters_pending update to be seen after
+ * the shortest_full is set (in rb_watermark_hit). If the
+ * writer sees the full_waiters_pending flag set, it will
+ * compare the amount in the ring buffer to shortest_full.
+ * If the amount in the ring buffer is greater than the
+ * shortest_full percent, it will call the irq_work handler
+ * to wake up this list. The irq_handler will reset shortest_full
+ * back to zero. That's done under the reader_lock, but
+ * the below smp_mb() makes sure that the update to
+ * full_waiters_pending doesn't leak up into the above.
+ */
+ smp_mb();
rbwork->full_waiters_pending = true;
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- } else {
- poll_wait(filp, &rbwork->waiters, poll_table);
- rbwork->waiters_pending = true;
+ return 0;
}
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -989,9 +1030,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
- if (full)
- return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
-
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -1355,7 +1393,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
- local_inc(&cpu_buffer->pages_touched);
/*
* Just make sure we have seen our old_write and synchronize
* with any interrupts that come in.
@@ -1392,8 +1429,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- /* Again, either we update tail_page or an interrupt does */
- (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
+ /* Either we update tail_page or an interrupt does */
+ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page))
+ local_inc(&cpu_buffer->pages_touched);
}
}
@@ -1485,7 +1523,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
@@ -1570,7 +1609,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -4350,7 +4390,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
- commit_page = cpu_buffer->commit_page;
+ commit_page = READ_ONCE(cpu_buffer->commit_page);
commit_ts = commit_page->page->time_stamp;
/*
@@ -5538,7 +5578,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page) {
kfree(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9c898307348..233d1af39fff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -13,7 +13,7 @@
* Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/ring_buffer.h>
-#include <generated/utsrelease.h>
+#include <linux/utsname.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
@@ -39,7 +39,6 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
-#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -105,7 +104,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_taskinfo_save);
+DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -131,9 +130,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* /proc/sys/kernel/ftrace_dump_on_oops
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
+ * Set instance name if you want to dump the specific trace instance
+ * Multiple instance dump is also supported, and instances are separated
+ * by commas.
*/
-
-enum ftrace_dump_mode ftrace_dump_on_oops;
+/* Set to string format zero to disable by default */
+char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
@@ -179,7 +181,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
unsigned int trace_ctx);
-#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
@@ -202,19 +203,33 @@ static int __init set_cmdline_ftrace(char *str)
}
__setup("ftrace=", set_cmdline_ftrace);
+int ftrace_dump_on_oops_enabled(void)
+{
+ if (!strcmp("0", ftrace_dump_on_oops))
+ return 0;
+ else
+ return 1;
+}
+
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str || !strcmp("1", str)) {
- ftrace_dump_on_oops = DUMP_ALL;
+ if (!*str) {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
return 1;
}
- if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
- ftrace_dump_on_oops = DUMP_ORIG;
- return 1;
- }
+ if (*str == ',') {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
+ strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1);
+ return 1;
+ }
- return 0;
+ if (*str++ == '=') {
+ strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE);
+ return 1;
+ }
+
+ return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
@@ -1301,6 +1316,50 @@ static void free_snapshot(struct trace_array *tr)
tr->allocated_snapshot = false;
}
+static int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+ int ret;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (tr->snapshot == UINT_MAX) {
+ spin_unlock(&tr->snapshot_trigger_lock);
+ return -EBUSY;
+ }
+
+ tr->snapshot++;
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret) {
+ spin_lock(&tr->snapshot_trigger_lock);
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+ }
+
+ return ret;
+}
+
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_arm_snapshot_locked(tr);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
@@ -1374,10 +1433,6 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
mutex_lock(&trace_types_lock);
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret)
- goto fail_unlock;
-
if (tr->current_trace->use_max_tr) {
ret = -EBUSY;
goto fail_unlock;
@@ -1396,6 +1451,10 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
goto fail_unlock;
}
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ goto fail_unlock;
+
local_irq_disable();
arch_spin_lock(&tr->max_lock);
tr->cond_snapshot = cond_snapshot;
@@ -1440,6 +1499,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
+ tracing_disarm_snapshot(tr);
+
return ret;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
@@ -1482,6 +1543,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#define free_snapshot(tr) do { } while (0)
+#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1955,15 +2017,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
#endif /* CONFIG_TRACER_MAX_TRACE */
+struct pipe_wait {
+ struct trace_iterator *iter;
+ int wait_index;
+};
+
+static bool wait_pipe_cond(void *data)
+{
+ struct pipe_wait *pwait = data;
+ struct trace_iterator *iter = pwait->iter;
+
+ if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index)
+ return true;
+
+ return iter->closed;
+}
+
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ struct pipe_wait pwait;
int ret;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+ pwait.wait_index = atomic_read_acquire(&iter->wait_index);
+ pwait.iter = iter;
+
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
+ wait_pipe_cond, &pwait);
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -2299,98 +2382,6 @@ void tracing_reset_all_online_cpus(void)
mutex_unlock(&trace_types_lock);
}
-/*
- * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
- * is the tgid last observed corresponding to pid=i.
- */
-static int *tgid_map;
-
-/* The maximum valid index into tgid_map. */
-static size_t tgid_map_max;
-
-#define SAVED_CMDLINES_DEFAULT 128
-#define NO_CMDLINE_MAP UINT_MAX
-/*
- * Preemption must be disabled before acquiring trace_cmdline_lock.
- * The various trace_arrays' max_lock must be acquired in a context
- * where interrupt is disabled.
- */
-static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-struct saved_cmdlines_buffer {
- unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
- unsigned *map_cmdline_to_pid;
- unsigned cmdline_num;
- int cmdline_idx;
- char saved_cmdlines[];
-};
-static struct saved_cmdlines_buffer *savedcmd;
-
-static inline char *get_saved_cmdlines(int idx)
-{
- return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
-}
-
-static inline void set_cmdline(int idx, const char *cmdline)
-{
- strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
-}
-
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
-
- kfree(s->map_cmdline_to_pid);
- kmemleak_free(s);
- free_pages((unsigned long)s, order);
-}
-
-static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
-{
- struct saved_cmdlines_buffer *s;
- struct page *page;
- int orig_size, size;
- int order;
-
- /* Figure out how much is needed to hold the given number of cmdlines */
- orig_size = sizeof(*s) + val * TASK_COMM_LEN;
- order = get_order(orig_size);
- size = 1 << (order + PAGE_SHIFT);
- page = alloc_pages(GFP_KERNEL, order);
- if (!page)
- return NULL;
-
- s = page_address(page);
- kmemleak_alloc(s, size, 1, GFP_KERNEL);
- memset(s, 0, sizeof(*s));
-
- /* Round up to actual allocation */
- val = (size - sizeof(*s)) / TASK_COMM_LEN;
- s->cmdline_num = val;
-
- s->map_cmdline_to_pid = kmalloc_array(val,
- sizeof(*s->map_cmdline_to_pid),
- GFP_KERNEL);
- if (!s->map_cmdline_to_pid) {
- free_saved_cmdlines_buffer(s);
- return NULL;
- }
-
- s->cmdline_idx = 0;
- memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
- sizeof(s->map_pid_to_cmdline));
- memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
- val * sizeof(*s->map_cmdline_to_pid));
-
- return s;
-}
-
-static int trace_create_savedcmd(void)
-{
- savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
-
- return savedcmd ? 0 : -ENOMEM;
-}
-
int is_tracing_stopped(void)
{
return global_trace.stop_count;
@@ -2483,201 +2474,6 @@ void tracing_stop(void)
return tracing_stop_tr(&global_trace);
}
-static int trace_save_cmdline(struct task_struct *tsk)
-{
- unsigned tpid, idx;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
-
- /*
- * It's not the end of the world if we don't get
- * the lock, but we also don't want to spin
- * nor do we want to disable interrupts,
- * so if we miss here, then better luck next time.
- *
- * This is called within the scheduler and wake up, so interrupts
- * had better been disabled and run queue lock been held.
- */
- lockdep_assert_preemption_disabled();
- if (!arch_spin_trylock(&trace_cmdline_lock))
- return 0;
-
- idx = savedcmd->map_pid_to_cmdline[tpid];
- if (idx == NO_CMDLINE_MAP) {
- idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
-
- savedcmd->map_pid_to_cmdline[tpid] = idx;
- savedcmd->cmdline_idx = idx;
- }
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- set_cmdline(idx, tsk->comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
-
- return 1;
-}
-
-static void __trace_find_cmdline(int pid, char comm[])
-{
- unsigned map;
- int tpid;
-
- if (!pid) {
- strcpy(comm, "<idle>");
- return;
- }
-
- if (WARN_ON_ONCE(pid < 0)) {
- strcpy(comm, "<XXX>");
- return;
- }
-
- tpid = pid & (PID_MAX_DEFAULT - 1);
- map = savedcmd->map_pid_to_cmdline[tpid];
- if (map != NO_CMDLINE_MAP) {
- tpid = savedcmd->map_cmdline_to_pid[map];
- if (tpid == pid) {
- strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- return;
- }
- }
- strcpy(comm, "<...>");
-}
-
-void trace_find_cmdline(int pid, char comm[])
-{
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- __trace_find_cmdline(pid, comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int *trace_find_tgid_ptr(int pid)
-{
- /*
- * Pairs with the smp_store_release in set_tracer_flag() to ensure that
- * if we observe a non-NULL tgid_map then we also observe the correct
- * tgid_map_max.
- */
- int *map = smp_load_acquire(&tgid_map);
-
- if (unlikely(!map || pid > tgid_map_max))
- return NULL;
-
- return &map[pid];
-}
-
-int trace_find_tgid(int pid)
-{
- int *ptr = trace_find_tgid_ptr(pid);
-
- return ptr ? *ptr : 0;
-}
-
-static int trace_save_tgid(struct task_struct *tsk)
-{
- int *ptr;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- ptr = trace_find_tgid_ptr(tsk->pid);
- if (!ptr)
- return 0;
-
- *ptr = tsk->tgid;
- return 1;
-}
-
-static bool tracing_record_taskinfo_skip(int flags)
-{
- if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
- return true;
- if (!__this_cpu_read(trace_taskinfo_save))
- return true;
- return false;
-}
-
-/**
- * tracing_record_taskinfo - record the task info of a task
- *
- * @task: task to record
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo(struct task_struct *task, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/**
- * tracing_record_taskinfo_sched_switch - record task info for sched_switch
- *
- * @prev: previous task during sched_switch
- * @next: next task during sched_switch
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
- struct task_struct *next, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
- done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/* Helpers to record a specific task information */
-void tracing_record_cmdline(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
-}
-
-void tracing_record_tgid(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_TGID);
-}
-
/*
* Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
* overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
@@ -4368,7 +4164,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
get_total_entries(buf, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
- name, UTS_RELEASE);
+ name, init_utsname()->release);
seq_puts(m, "# -----------------------------------"
"---------------------------------\n");
seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
@@ -5436,8 +5232,6 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
- int *map;
-
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -5460,20 +5254,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map) {
- tgid_map_max = pid_max;
- map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
- GFP_KERNEL);
- /*
- * Pairs with smp_load_acquire() in
- * trace_find_tgid_ptr() to ensure that if it observes
- * the tgid_map we just allocated then it also observes
- * the corresponding tgid_map_max value.
- */
- smp_store_release(&tgid_map, map);
- }
- if (!tgid_map) {
+ if (trace_alloc_tgid_map() < 0) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
}
@@ -5747,16 +5529,15 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
-#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t <argname>[->field[->field|.field...]],\n"
-#else
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
+ "\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t symstr, <type>\\[<array-size>\\]\n"
@@ -5918,207 +5699,6 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
-static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
-{
- int pid = ++(*pos);
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
-{
- int pid = *pos;
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void saved_tgids_stop(struct seq_file *m, void *v)
-{
-}
-
-static int saved_tgids_show(struct seq_file *m, void *v)
-{
- int *entry = (int *)v;
- int pid = entry - tgid_map;
- int tgid = *entry;
-
- if (tgid == 0)
- return SEQ_SKIP;
-
- seq_printf(m, "%d %d\n", pid, tgid);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_tgids_seq_ops = {
- .start = saved_tgids_start,
- .stop = saved_tgids_stop,
- .next = saved_tgids_next,
- .show = saved_tgids_show,
-};
-
-static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_tgids_seq_ops);
-}
-
-
-static const struct file_operations tracing_saved_tgids_fops = {
- .open = tracing_saved_tgids_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
-{
- unsigned int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
- ptr++) {
- if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
- continue;
-
- return ptr;
- }
-
- return NULL;
-}
-
-static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
-{
- void *v;
- loff_t l = 0;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- v = &savedcmd->map_cmdline_to_pid[0];
- while (l <= *pos) {
- v = saved_cmdlines_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
-}
-
-static void saved_cmdlines_stop(struct seq_file *m, void *v)
-{
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int saved_cmdlines_show(struct seq_file *m, void *v)
-{
- char buf[TASK_COMM_LEN];
- unsigned int *pid = v;
-
- __trace_find_cmdline(*pid, buf);
- seq_printf(m, "%d %s\n", *pid, buf);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
- .start = saved_cmdlines_start,
- .next = saved_cmdlines_next,
- .stop = saved_cmdlines_stop,
- .show = saved_cmdlines_show,
-};
-
-static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
-}
-
-static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_saved_cmdlines_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static ssize_t
-tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- int r;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static int tracing_resize_saved_cmdlines(unsigned int val)
-{
- struct saved_cmdlines_buffer *s, *savedcmd_temp;
-
- s = allocate_cmdlines_buffer(val);
- if (!s)
- return -ENOMEM;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- savedcmd_temp = savedcmd;
- savedcmd = s;
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
- free_saved_cmdlines_buffer(savedcmd_temp);
-
- return 0;
-}
-
-static ssize_t
-tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long val;
- int ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- /* must have at least 1 entry or less than PID_MAX_DEFAULT */
- if (!val || val > PID_MAX_DEFAULT)
- return -EINVAL;
-
- ret = tracing_resize_saved_cmdlines((unsigned int)val);
- if (ret < 0)
- return ret;
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static const struct file_operations tracing_saved_cmdlines_size_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_size_read,
- .write = tracing_saved_cmdlines_size_write,
-};
-
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
static union trace_eval_map_item *
update_eval_map(union trace_eval_map_item *ptr)
@@ -6595,11 +6175,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
*/
synchronize_rcu();
free_snapshot(tr);
+ tracing_disarm_snapshot(tr);
}
- if (t->use_max_tr && !tr->allocated_snapshot) {
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
+ if (!had_max_tr && t->use_max_tr) {
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
goto out;
}
#else
@@ -6608,8 +6189,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t->init) {
ret = tracer_init(t, tr);
- if (ret)
+ if (ret) {
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (t->use_max_tr)
+ tracing_disarm_snapshot(tr);
+#endif
goto out;
+ }
}
tr->current_trace = t;
@@ -7711,10 +7297,11 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, iter->cpu_file);
- else
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
break;
+
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
local_irq_disable();
@@ -7724,6 +7311,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
(void *)tr, 1);
}
+ tracing_disarm_snapshot(tr);
break;
default:
if (tr->allocated_snapshot) {
@@ -8398,9 +7986,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- iter->wait_index++;
+ iter->closed = true;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8500,6 +8088,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ bool woken = false;
int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8573,17 +8162,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
- long wait_index;
if (ret)
goto out;
+ if (woken)
+ goto out;
+
ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- wait_index = READ_ONCE(iter->wait_index);
-
ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
@@ -8592,10 +8181,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if (!tracer_tracing_is_on(iter->tr))
goto out;
- /* Make sure we see the new wait_index */
- smp_rmb();
- if (wait_index != iter->wait_index)
- goto out;
+ /* Iterate one more time to collect any new data then exit */
+ woken = true;
goto again;
}
@@ -8618,9 +8205,8 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
mutex_lock(&trace_types_lock);
- iter->wait_index++;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8857,8 +8443,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
- if (glob[0] == '!')
- return unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (glob[0] == '!') {
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (!ret)
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+ }
if (!param)
goto out_reg;
@@ -8877,12 +8468,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = tracing_alloc_snapshot_instance(tr);
+ ret = tracing_arm_snapshot(tr);
if (ret < 0)
goto out;
ret = register_ftrace_function_probe(glob, tr, ops, count);
-
+ if (ret < 0)
+ tracing_disarm_snapshot(tr);
out:
return ret < 0 ? ret : 0;
}
@@ -9689,7 +9281,9 @@ trace_array_create_systems(const char *name, const char *systems)
raw_spin_lock_init(&tr->start_lock);
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&tr->snapshot_trigger_lock);
+#endif
tr->current_trace = &nop_trace;
INIT_LIST_HEAD(&tr->systems);
@@ -10254,14 +9848,14 @@ static struct notifier_block trace_die_notifier = {
static int trace_die_panic_handler(struct notifier_block *self,
unsigned long ev, void *unused)
{
- if (!ftrace_dump_on_oops)
+ if (!ftrace_dump_on_oops_enabled())
return NOTIFY_DONE;
/* The die notifier requires DIE_OOPS to trigger */
if (self == &trace_die_notifier && ev != DIE_OOPS)
return NOTIFY_DONE;
- ftrace_dump(ftrace_dump_on_oops);
+ ftrace_dump(DUMP_PARAM);
return NOTIFY_DONE;
}
@@ -10302,12 +9896,12 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
-void trace_init_global_iter(struct trace_iterator *iter)
+static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
- iter->tr = &global_trace;
+ iter->tr = tr;
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
- iter->array_buffer = &global_trace.array_buffer;
+ iter->array_buffer = &tr->array_buffer;
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
@@ -10327,22 +9921,19 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ trace_init_iter(iter, &global_trace);
+}
+
+static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
- static atomic_t dump_running;
- struct trace_array *tr = &global_trace;
unsigned int old_userobj;
unsigned long flags;
int cnt = 0, cpu;
- /* Only allow one dump user at a time. */
- if (atomic_inc_return(&dump_running) != 1) {
- atomic_dec(&dump_running);
- return;
- }
-
/*
* Always turn off tracing when we dump.
* We don't need to show trace output of what happens
@@ -10351,12 +9942,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
* If the user does a sysrq-z, then they can re-enable
* tracing with echo 1 > tracing_on.
*/
- tracing_off();
+ tracer_tracing_off(tr);
local_irq_save(flags);
/* Simulate the iterator */
- trace_init_global_iter(&iter);
+ trace_init_iter(&iter, tr);
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -10367,21 +9958,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- switch (oops_dump_mode) {
- case DUMP_ALL:
- iter.cpu_file = RING_BUFFER_ALL_CPUS;
- break;
- case DUMP_ORIG:
+ if (dump_mode == DUMP_ORIG)
iter.cpu_file = raw_smp_processor_id();
- break;
- case DUMP_NONE:
- goto out_enable;
- default:
- printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ else
iter.cpu_file = RING_BUFFER_ALL_CPUS;
- }
- printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ if (tr == &global_trace)
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ else
+ printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name);
/* Did function tracer already get disabled? */
if (ftrace_is_dead()) {
@@ -10423,15 +10008,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
else
printk(KERN_TRACE "---------------------------------\n");
- out_enable:
tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
- atomic_dec(&dump_running);
local_irq_restore(flags);
}
+
+static void ftrace_dump_by_param(void)
+{
+ bool first_param = true;
+ char dump_param[MAX_TRACER_SIZE];
+ char *buf, *token, *inst_name;
+ struct trace_array *tr;
+
+ strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE);
+ buf = dump_param;
+
+ while ((token = strsep(&buf, ",")) != NULL) {
+ if (first_param) {
+ first_param = false;
+ if (!strcmp("0", token))
+ continue;
+ else if (!strcmp("1", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ continue;
+ }
+ else if (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ continue;
+ }
+ }
+
+ inst_name = strsep(&token, "=");
+ tr = trace_array_find(inst_name);
+ if (!tr) {
+ printk(KERN_TRACE "Instance %s not found\n", inst_name);
+ continue;
+ }
+
+ if (token && (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)))
+ ftrace_dump_one(tr, DUMP_ORIG);
+ else
+ ftrace_dump_one(tr, DUMP_ALL);
+ }
+}
+
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+{
+ static atomic_t dump_running;
+
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ break;
+ case DUMP_ORIG:
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ break;
+ case DUMP_PARAM:
+ ftrace_dump_by_param();
+ break;
+ case DUMP_NONE:
+ break;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ }
+
+ atomic_dec(&dump_running);
+}
EXPORT_SYMBOL_GPL(ftrace_dump);
#define WRITE_BUFSIZE 4096
@@ -10659,7 +10313,9 @@ __init static int tracer_alloc_buffers(void)
global_trace.current_trace = &nop_trace;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&global_trace.snapshot_trigger_lock);
+#endif
ftrace_init_global_array_ops(&global_trace);
init_trace_flags_index(&global_trace);
@@ -10696,7 +10352,7 @@ __init static int tracer_alloc_buffers(void)
out_free_pipe_cpumask:
free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
- free_saved_cmdlines_buffer(savedcmd);
+ trace_free_saved_cmdlines_buffer();
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
out_rm_hp_state:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 00f873910c5d..64450615ca0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,8 +334,8 @@ struct trace_array {
*/
struct array_buffer max_buffer;
bool allocated_snapshot;
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
+ spinlock_t snapshot_trigger_lock;
+ unsigned int snapshot;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -1375,6 +1375,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
+DECLARE_PER_CPU(bool, trace_taskinfo_save);
+int trace_save_cmdline(struct task_struct *tsk);
+int trace_create_savedcmd(void);
+int trace_alloc_tgid_map(void);
+void trace_free_saved_cmdlines_buffer(void);
+
+extern const struct file_operations tracing_saved_cmdlines_fops;
+extern const struct file_operations tracing_saved_tgids_fops;
+extern const struct file_operations tracing_saved_cmdlines_size_fops;
+
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
@@ -1973,12 +1983,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len)
#ifdef CONFIG_TRACER_SNAPSHOT
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot(struct trace_array *tr);
+void tracing_disarm_snapshot(struct trace_array *tr);
#else
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
+static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
#endif
#ifdef CONFIG_PREEMPT_TRACER
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 54d5fa35c90a..811b08439406 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -92,7 +92,6 @@ static void trace_do_benchmark(void)
bm_total += delta;
bm_totalsq += delta * delta;
-
if (bm_cnt > 1) {
/*
* Apply Welford's method to calculate standard deviation:
@@ -105,7 +104,7 @@ static void trace_do_benchmark(void)
stddev = 0;
delta = bm_total;
- do_div(delta, bm_cnt);
+ delta = div64_u64(delta, bm_cnt);
avg = delta;
if (stddev > 0) {
@@ -127,7 +126,7 @@ static void trace_do_benchmark(void)
seed = stddev;
if (!last_seed)
break;
- do_div(seed, last_seed);
+ seed = div64_u64(seed, last_seed);
seed += last_seed;
do_div(seed, 2);
} while (i++ < 10 && last_seed != seed);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 03c851f57969..b0e0ec85912e 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -220,7 +220,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
if (!ep->event_system)
goto error;
- ret = trace_probe_init(&ep->tp, this_event, group, false);
+ ret = trace_probe_init(&ep->tp, this_event, group, false, nargs);
if (ret < 0)
goto error;
@@ -390,8 +390,8 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
unsigned long val;
int ret;
@@ -438,7 +438,7 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &edata->ep->tp, rec, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7c364b87352e..6ef29eba90ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
@@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
+#endif
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
@@ -2152,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = {
.release = seq_release,
};
+#ifdef CONFIG_PERF_EVENTS
static const struct file_operations ftrace_event_id_fops = {
.read = event_id_read,
.llseek = default_llseek,
};
+#endif
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_file_tr,
@@ -2548,6 +2552,14 @@ static int event_callback(const char *name, umode_t *mode, void **data,
return 0;
}
+/* The file is incremented on creation and freeing the enable file decrements it */
+static void event_release(const char *name, void *data)
+{
+ struct trace_event_file *file = data;
+
+ event_file_put(file);
+}
+
static int
event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
@@ -2562,6 +2574,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
.name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
.name = "filter",
@@ -2630,6 +2643,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
return ret;
}
+ /* Gets decremented on freeing of the "enable" file */
+ event_file_get(file);
+
return 0;
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b33c3861fbbb..4bec043c8690 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -597,20 +597,12 @@ out:
return ret;
}
-/**
- * unregister_trigger - Generic event_command @unreg implementation
- * @glob: The raw string used to register the trigger
- * @test: Trigger-specific data used to find the trigger to remove
- * @file: The trace_event_file associated with the event
- *
- * Common implementation for event trigger unregistration.
- *
- * Usually used directly as the @unreg method in event command
- * implementations.
+/*
+ * True if the trigger was found and unregistered, else false.
*/
-static void unregister_trigger(char *glob,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+static bool try_unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct event_trigger_data *data = NULL, *iter;
@@ -626,8 +618,32 @@ static void unregister_trigger(char *glob,
}
}
- if (data && data->ops->free)
- data->ops->free(data);
+ if (data) {
+ if (data->ops->free)
+ data->ops->free(data);
+
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * unregister_trigger - Generic event_command @unreg implementation
+ * @glob: The raw string used to register the trigger
+ * @test: Trigger-specific data used to find the trigger to remove
+ * @file: The trace_event_file associated with the event
+ *
+ * Common implementation for event trigger unregistration.
+ *
+ * Usually used directly as the @unreg method in event command
+ * implementations.
+ */
+static void unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
+{
+ try_unregister_trigger(glob, test, file);
}
/*
@@ -1470,12 +1486,23 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = tracing_alloc_snapshot_instance(file->tr);
+ int ret = tracing_arm_snapshot(file->tr);
if (ret < 0)
return ret;
- return register_trigger(glob, data, file);
+ ret = register_trigger(glob, data, file);
+ if (ret < 0)
+ tracing_disarm_snapshot(file->tr);
+ return ret;
+}
+
+static void unregister_snapshot_trigger(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ if (try_unregister_trigger(glob, data, file))
+ tracing_disarm_snapshot(file->tr);
}
static int
@@ -1510,7 +1537,7 @@ static struct event_command trigger_snapshot_cmd = {
.trigger_type = ETT_SNAPSHOT,
.parse = event_trigger_parse,
.reg = register_snapshot_trigger,
- .unreg = unregister_trigger,
+ .unreg = unregister_snapshot_trigger,
.get_trigger_ops = snapshot_get_trigger_ops,
.set_filter = set_trigger_filter,
};
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e76f5e1efdf2..70d428c394b6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
/*
@@ -54,10 +55,13 @@
* allows isolation for events by various means.
*/
struct user_event_group {
- char *system_name;
- struct hlist_node node;
- struct mutex reg_mutex;
+ char *system_name;
+ char *system_multi_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+ /* ID that moves forward within the group for multi-event names */
+ u64 multi_id;
};
/* Group for init_user_ns mapping, top-most group */
@@ -78,6 +82,7 @@ static unsigned int current_user_events;
*/
struct user_event {
struct user_event_group *group;
+ char *reg_name;
struct tracepoint tracepoint;
struct trace_event_call call;
struct trace_event_class class;
@@ -127,6 +132,8 @@ struct user_event_enabler {
#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
+#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT)
+
/* Used for asynchronous faulting in of pages */
struct user_event_enabler_fault {
struct work_struct work;
@@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
static u32 user_event_key(char *name)
{
@@ -328,6 +337,7 @@ out:
static void user_event_group_destroy(struct user_event_group *group)
{
kfree(group->system_name);
+ kfree(group->system_multi_name);
kfree(group);
}
@@ -346,6 +356,11 @@ static char *user_event_group_system_name(void)
return system_name;
}
+static char *user_event_group_system_multi_name(void)
+{
+ return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL);
+}
+
static struct user_event_group *current_user_event_group(void)
{
return init_group;
@@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void)
if (!group->system_name)
goto error;
+ group->system_multi_name = user_event_group_system_multi_name();
+
+ if (!group->system_multi_name)
+ goto error;
+
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user)
hash_del(&user->node);
user_event_destroy_validators(user);
+
+ /* If we have different names, both must be freed */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user->call.print_fmt);
kfree(EVENT_NAME(user));
kfree(user);
@@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user)
}
static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char **argv,
+ u32 flags, u32 *outkey)
{
struct user_event *user;
u32 key = user_event_key(name);
*outkey = key;
- hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name))
+ hash_for_each_possible(group->register_table, user, node, key) {
+ /*
+ * Single-format events shouldn't return multi-format
+ * events. Callers expect the underlying tracepoint to match
+ * the name exactly in these cases. Only check like-formats.
+ */
+ if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags))
+ continue;
+
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
+
+ if (user_fields_match(user, argc, argv))
return user_event_get(user);
+ /* Scan others if this is a multi-format event */
+ if (EVENT_MULTI_FORMAT(flags))
+ continue;
+
+ return ERR_PTR(-EADDRINUSE);
+ }
+
return NULL;
}
@@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc,
struct list_head *head = &user->fields;
int i = 0;
+ if (argc == 0)
+ return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, &i))
return false;
@@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event,
struct user_event *user = container_of(ev, struct user_event, devent);
bool match;
- match = strcmp(EVENT_NAME(user), event) == 0 &&
- (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+ match = strcmp(EVENT_NAME(user), event) == 0;
+
+ if (match && system) {
+ match = strcmp(system, user->group->system_name) == 0 ||
+ strcmp(system, user->group->system_multi_name) == 0;
+ }
- if (match && argc > 0)
+ if (match)
match = user_fields_match(user, argc, argv);
- else if (match && argc == 0)
- match = list_empty(&user->fields);
return match;
}
@@ -1913,6 +1962,33 @@ static int user_event_trace_register(struct user_event *user)
return ret;
}
+static int user_event_set_tp_name(struct user_event *user)
+{
+ lockdep_assert_held(&user->group->reg_mutex);
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags)) {
+ char *multi_name;
+
+ multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx",
+ user->reg_name, user->group->multi_id);
+
+ if (!multi_name)
+ return -ENOMEM;
+
+ user->call.name = multi_name;
+ user->tracepoint.name = multi_name;
+
+ /* Inc to ensure unique multi-event name next time */
+ user->group->multi_id++;
+ } else {
+ /* Non-multi-format events use the registered name */
+ user->call.name = user->reg_name;
+ user->tracepoint.name = user->reg_name;
+ }
+
+ return 0;
+}
+
/*
* Parses the event name, arguments and flags then registers if successful.
* The name buffer lifetime is owned by this method for success cases only.
@@ -1922,11 +1998,11 @@ static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
{
- int ret;
- u32 key;
struct user_event *user;
+ char **argv = NULL;
int argc = 0;
- char **argv;
+ int ret;
+ u32 key;
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +2011,34 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
+ if (args) {
+ argv = argv_split(GFP_KERNEL, args, &argc);
+
+ if (!argv)
+ return -ENOMEM;
+ }
+
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
- user = find_user_event(group, name, &key);
+ user = find_user_event(group, name, argc, (const char **)argv,
+ reg_flags, &key);
mutex_unlock(&event_mutex);
- if (user) {
- if (args) {
- argv = argv_split(GFP_KERNEL, args, &argc);
- if (!argv) {
- ret = -ENOMEM;
- goto error;
- }
+ if (argv)
+ argv_free(argv);
- ret = user_fields_match(user, argc, (const char **)argv);
- argv_free(argv);
-
- } else
- ret = list_empty(&user->fields);
-
- if (ret) {
- *newuser = user;
- /*
- * Name is allocated by caller, free it since it already exists.
- * Caller only worries about failure cases for freeing.
- */
- kfree(name);
- } else {
- ret = -EADDRINUSE;
- goto error;
- }
+ if (IS_ERR(user))
+ return PTR_ERR(user);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
return 0;
-error:
- user_event_put(user, false);
- return ret;
}
user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
@@ -1982,7 +2051,13 @@ error:
INIT_LIST_HEAD(&user->validators);
user->group = group;
- user->tracepoint.name = name;
+ user->reg_name = name;
+ user->reg_flags = reg_flags;
+
+ ret = user_event_set_tp_name(user);
+
+ if (ret)
+ goto put_user;
ret = user_event_parse_fields(user, args);
@@ -1996,11 +2071,14 @@ error:
user->call.data = user;
user->call.class = &user->class;
- user->call.name = name;
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
user->call.tp = &user->tracepoint;
user->call.event.funcs = &user_event_funcs;
- user->class.system = group->system_name;
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags))
+ user->class.system = group->system_multi_name;
+ else
+ user->class.system = group->system_name;
user->class.fields_array = user_event_fields_array;
user->class.get_fields = user_event_get_fields;
@@ -2022,8 +2100,6 @@ error:
if (ret)
goto put_user_lock;
- user->reg_flags = reg_flags;
-
if (user->reg_flags & USER_EVENT_REG_PERSIST) {
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
@@ -2047,30 +2123,43 @@ put_user:
user_event_destroy_fields(user);
user_event_destroy_validators(user);
kfree(user->call.print_fmt);
+
+ /* Caller frees reg_name on error, but not multi-name */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user);
return ret;
}
/*
- * Deletes a previously created event if it is no longer being used.
+ * Deletes previously created events if they are no longer being used.
*/
static int delete_user_event(struct user_event_group *group, char *name)
{
- u32 key;
- struct user_event *user = find_user_event(group, name, &key);
+ struct user_event *user;
+ struct hlist_node *tmp;
+ u32 key = user_event_key(name);
+ int ret = -ENOENT;
- if (!user)
- return -ENOENT;
+ /* Attempt to delete all event(s) with the name passed in */
+ hash_for_each_possible_safe(group->register_table, user, tmp, node, key) {
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
- user_event_put(user, true);
+ if (!user_event_last_ref(user))
+ return -EBUSY;
- if (!user_event_last_ref(user))
- return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
- if (!user_event_capable(user->reg_flags))
- return -EPERM;
+ ret = destroy_user_event(user);
- return destroy_user_event(user);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
}
/*
@@ -2628,7 +2717,7 @@ static int user_seq_show(struct seq_file *m, void *p)
hash_for_each(group->register_table, i, user, node) {
status = user->status;
- seq_printf(m, "%s", EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_TP_NAME(user));
if (status != 0)
seq_puts(m, " #");
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 7d2ddbcfa377..4f4280815522 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,6 +4,7 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
+#include <asm/ptrace.h>
#include <linux/fprobe.h>
#include <linux/module.h>
@@ -129,8 +130,8 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
* from user space.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -152,6 +153,9 @@ retry:
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
+ break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
@@ -184,7 +188,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
@@ -194,7 +198,7 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -210,11 +214,24 @@ fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
}
NOKPROBE_SYMBOL(fentry_trace_func);
-/* Kretprobe handler */
+/* Entry handler for function exit probes: saves entry data for the exit handler */
+static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+
+ if (tf->tp.entry_arg)
+ store_trace_entry_data(entry_data, &tf->tp, regs);
+
+ return 0;
+}
+NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
+
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
- struct trace_event_file *trace_file)
+ void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
@@ -227,7 +244,7 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
@@ -238,19 +255,19 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct pt_regs *regs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
@@ -269,7 +286,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -280,7 +297,7 @@ static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -289,7 +306,8 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct pt_regs *regs,
+ void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
@@ -301,7 +319,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, regs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -312,7 +330,7 @@ fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -343,10 +361,10 @@ static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs);
+ fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs);
+ fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -389,7 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->tpoint = tpoint;
tf->fp.nr_maxactive = maxactive;
- ret = trace_probe_init(&tf->tp, event, group, false);
+ ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
goto error;
@@ -1109,6 +1127,11 @@ static int __trace_fprobe_create(int argc, const char *argv[])
goto error; /* This can be -ENOMEM */
}
+ if (is_return && tf->tp.entry_arg) {
+ tf->fp.entry_handler = trace_fprobe_entry_handler;
+ tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp);
+ }
+
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..14099cc17fc9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group, false);
+ ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
goto error;
@@ -740,6 +740,9 @@ static unsigned int number_of_same_symbols(char *func_name)
return ctx.count;
}
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
@@ -948,6 +951,11 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (ret)
goto error; /* This can be -ENOMEM */
}
+ /* entry handler for kretprobe */
+ if (is_return && tk->tp.entry_arg) {
+ tk->rp.entry_handler = trace_kprobe_entry_handler;
+ tk->rp.data_size = traceprobe_get_entry_data_size(&tk->tp);
+ }
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
@@ -1303,8 +1311,8 @@ static const struct file_operations kprobe_profile_ops = {
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -1329,6 +1337,9 @@ retry:
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
+ break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
@@ -1359,7 +1370,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1368,7 +1379,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
fbuffer.regs = regs;
entry->ip = (unsigned long)tk->rp.kp.addr;
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1384,6 +1395,31 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
+
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+ struct trace_kprobe *tk;
+
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregistered on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return -ENOENT;
+
+ tk = container_of(rp, struct trace_kprobe, rp);
+
+ /* store argument values into ri->data as entry data */
+ if (tk->tp.entry_arg)
+ store_trace_entry_data(ri->data, &tk->tp, regs);
+
+ return 0;
+}
+
+
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
@@ -1399,7 +1435,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1409,7 +1445,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1557,7 +1593,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1568,7 +1604,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -1593,7 +1629,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1604,7 +1640,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 34289f9c6707..c3f2937b434a 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -594,6 +594,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return 0;
}
+static int __store_entry_arg(struct trace_probe *tp, int argnum);
+
static int parse_btf_arg(char *varname,
struct fetch_insn **pcode, struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
@@ -618,11 +620,7 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
- if (ctx->flags & TPARG_FL_RETURN) {
- if (strcmp(varname, "$retval") != 0) {
- trace_probe_log_err(ctx->offset, NO_BTFARG);
- return -ENOENT;
- }
+ if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void */
if (query_btf_context(ctx) == 0) {
@@ -654,11 +652,21 @@ static int parse_btf_arg(char *varname,
const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
if (name && !strcmp(name, varname)) {
- code->op = FETCH_OP_ARG;
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param = i + 1;
- else
- code->param = i;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param = i + 1;
+ else
+ code->param = i;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ code->op = FETCH_OP_EDATA;
+ ret = __store_entry_arg(ctx->tp, i);
+ if (ret < 0) {
+ /* internal error */
+ return ret;
+ }
+ code->offset = ret;
+ }
tid = params[i].type;
goto found;
}
@@ -755,6 +763,110 @@ static int check_prepare_btf_string_fetch(char *typename,
#endif
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+
+static int __store_entry_arg(struct trace_probe *tp, int argnum)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ bool match = false;
+ int i, offset;
+
+ if (!earg) {
+ earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
+ if (!earg)
+ return -ENOMEM;
+ earg->size = 2 * tp->nr_args + 1;
+ earg->code = kcalloc(earg->size, sizeof(struct fetch_insn),
+ GFP_KERNEL);
+ if (!earg->code) {
+ kfree(earg);
+ return -ENOMEM;
+ }
+ /* Fill the code buffer with 'end' to simplify it */
+ for (i = 0; i < earg->size; i++)
+ earg->code[i].op = FETCH_OP_END;
+ tp->entry_arg = earg;
+ }
+
+ offset = 0;
+ for (i = 0; i < earg->size - 1; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ earg->code[i].op = FETCH_OP_ARG;
+ earg->code[i].param = argnum;
+ earg->code[i + 1].op = FETCH_OP_ST_EDATA;
+ earg->code[i + 1].offset = offset;
+ return offset;
+ case FETCH_OP_ARG:
+ match = (earg->code[i].param == argnum);
+ break;
+ case FETCH_OP_ST_EDATA:
+ offset = earg->code[i].offset;
+ if (match)
+ return offset;
+ offset += sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+ return -ENOSPC;
+}
+
+int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ int i, size = 0;
+
+ if (!earg)
+ return 0;
+
+ for (i = 0; i < earg->size; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ goto out;
+ case FETCH_OP_ST_EDATA:
+ size = earg->code[i].offset + sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+out:
+ return size;
+}
+
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+NOKPROBE_SYMBOL(store_trace_entry_data)
+#endif
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
@@ -830,7 +942,7 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
len = str_has_prefix(arg, "arg");
- if (len && tparg_is_function_entry(ctx->flags)) {
+ if (len) {
ret = kstrtoul(arg + len, 10, &param);
if (ret)
goto inval;
@@ -839,15 +951,29 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
err = TP_ERR_BAD_ARG_NUM;
goto inval;
}
+ param--; /* argN starts from 1, but internal arg[N] starts from 0 */
- code->op = FETCH_OP_ARG;
- code->param = (unsigned int)param - 1;
- /*
- * The tracepoint probe will probe a stub function, and the
- * first parameter of the stub is a dummy and should be ignored.
- */
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param++;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ code->param = (unsigned int)param;
+ /*
+ * The tracepoint probe will probe a stub function, and the
+ * first parameter of the stub is a dummy and should be ignored.
+ */
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param++;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ /* function entry argument access from return probe */
+ ret = __store_entry_arg(ctx->tp, param);
+ if (ret < 0) /* This error should be an internal error */
+ return ret;
+
+ code->op = FETCH_OP_EDATA;
+ code->offset = ret;
+ } else {
+ err = TP_ERR_NOFENTRY_ARGS;
+ goto inval;
+ }
return 0;
}
#endif
@@ -1037,7 +1163,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
default:
if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EINVAL;
}
@@ -1053,8 +1180,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
return ret;
}
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
/* Bitfield type needs to be parsed into a fetch function */
static int __parse_bitfield_probe_arg(const char *bf,
const struct fetch_type *t,
@@ -1090,67 +1215,45 @@ static int __parse_bitfield_probe_arg(const char *bf,
return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
}
-/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
- struct probe_arg *parg,
- struct traceprobe_parse_context *ctx)
+/* Split type part from @arg and return it. */
+static char *parse_probe_arg_type(char *arg, struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
{
- struct fetch_insn *code, *scode, *tmp = NULL;
- char *t, *t2, *t3;
- int ret, len;
- char *arg;
+ char *t = NULL, *t2, *t3;
+ int offs;
- arg = kstrdup(argv, GFP_KERNEL);
- if (!arg)
- return -ENOMEM;
-
- ret = -EINVAL;
- len = strlen(arg);
- if (len > MAX_ARGSTR_LEN) {
- trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
- goto out;
- } else if (len == 0) {
- trace_probe_log_err(ctx->offset, NO_ARG_BODY);
- goto out;
- }
-
- ret = -ENOMEM;
- parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm)
- goto out;
-
- ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
- *t = '\0';
- t2 = strchr(++t, '[');
+ *t++ = '\0';
+ t2 = strchr(t, '[');
if (t2) {
*t2++ = '\0';
t3 = strchr(t2, ']');
if (!t3) {
- int offs = t2 + strlen(t2) - arg;
+ offs = t2 + strlen(t2) - arg;
trace_probe_log_err(ctx->offset + offs,
ARRAY_NO_CLOSE);
- goto out;
+ return ERR_PTR(-EINVAL);
} else if (t3[1] != '\0') {
trace_probe_log_err(ctx->offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- goto out;
+ return ERR_PTR(-EINVAL);
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
trace_probe_log_err(ctx->offset + t2 - arg,
BAD_ARRAY_NUM);
- goto out;
+ return ERR_PTR(-EINVAL);
}
if (parg->count > MAX_ARRAY_LEN) {
trace_probe_log_err(ctx->offset + t2 - arg,
ARRAY_TOO_BIG);
- goto out;
+ return ERR_PTR(-EINVAL);
}
}
}
+ offs = t ? t - arg : 0;
/*
* Since $comm and immediate string can not be dereferenced,
@@ -1161,74 +1264,52 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array type. */
if (parg->count || (t && strcmp(t, "string"))) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- NEED_STRING_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, NEED_STRING_TYPE);
+ return ERR_PTR(-EINVAL);
}
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
+
if (!parg->type) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, BAD_TYPE);
+ return ERR_PTR(-EINVAL);
}
- code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code)
- goto out;
- code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
-
- ctx->last_type = NULL;
- ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- ctx);
- if (ret)
- goto fail;
-
- /* Update storing type if BTF is available */
- if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
- ctx->last_type) {
- if (!t) {
- parg->type = find_fetch_type_from_btf_type(ctx);
- } else if (strstr(t, "string")) {
- ret = check_prepare_btf_string_fetch(t, &code, ctx);
- if (ret)
- goto fail;
- }
- }
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
+ return t;
+}
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt) {
- ret = -ENOMEM;
- goto out;
- }
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
+/* After parsing, adjust the fetch_insn according to the probe_arg */
+static int finalize_fetch_insn(struct fetch_insn *code,
+ struct probe_arg *parg,
+ char *type,
+ int type_offset,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *scode;
+ int ret;
- ret = -EINVAL;
/* Store operation */
if (parg->type->is_string) {
+ /* Check bad combination of the type and the last fetch_insn. */
if (!strcmp(parg->type->name, "symstr")) {
if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_SYMSTRING);
- goto fail;
+ return -EINVAL;
}
} else {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_STRING);
- goto fail;
+ return -EINVAL;
}
}
+
if (!strcmp(parg->type->name, "symstr") ||
(code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
@@ -1244,9 +1325,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -EINVAL;
}
}
+
/* If op == DEREF, replace it with STRING */
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
@@ -1267,47 +1349,134 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_ST_RAW;
code->size = parg->type->size;
}
+
+ /* Save storing fetch_insn. */
scode = code;
+
/* Modify operation */
- if (t != NULL) {
- ret = __parse_bitfield_probe_arg(t, parg->type, &code);
+ if (type != NULL) {
+ /* Bitfield needs a special fetch_insn. */
+ ret = __parse_bitfield_probe_arg(type, parg->type, &code);
if (ret) {
- trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_BITFIELD);
+ return ret;
}
} else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
ctx->last_type) {
+ /* If the user did not specify a type, try parsing a BTF bitfield. */
ret = parse_btf_bitfield(&code, ctx);
if (ret)
- goto fail;
+ return ret;
}
- ret = -EINVAL;
+
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING &&
scode->op != FETCH_OP_ST_USTRING) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- BAD_STRING);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_STRING);
+ return -EINVAL;
}
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_LP_ARRAY;
code->param = parg->count;
}
+
+ /* Finalize the fetch_insn array. */
code++;
code->op = FETCH_OP_END;
- ret = 0;
+ return 0;
+}
+
+/* String length checking wrapper */
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
+ struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code, *tmp = NULL;
+ char *type, *arg;
+ int ret, len;
+
+ len = strlen(argv);
+ if (len > MAX_ARGSTR_LEN) {
+ trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
+ return -E2BIG;
+ } else if (len == 0) {
+ trace_probe_log_err(ctx->offset, NO_ARG_BODY);
+ return -EINVAL;
+ }
+
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ type = parse_probe_arg_type(arg, parg, ctx);
+ if (IS_ERR(type)) {
+ ret = PTR_ERR(type);
+ goto out;
+ }
+
+ code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
+ if (!code) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+
+ ctx->last_type = NULL;
+ ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
+ ctx);
+ if (ret < 0)
+ goto fail;
+
+ /* Update storing type if BTF is available */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!type) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(type, "string")) {
+ ret = check_prepare_btf_string_fetch(type, &code, ctx);
+ if (ret)
+ goto fail;
+ }
+ }
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
+
+ ret = finalize_fetch_insn(code, parg, type, type ? type - arg : 0, ctx);
+ if (ret < 0)
+ goto fail;
+
+ for (; code < tmp + FETCH_INSN_MAX; code++)
+ if (code->op == FETCH_OP_END)
+ break;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -1316,7 +1485,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1));
fail:
- if (ret) {
+ if (ret < 0) {
for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
if (code->op == FETCH_NOP_SYMBOL ||
code->op == FETCH_OP_DATA)
@@ -1379,9 +1548,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
struct probe_arg *parg = &tp->args[i];
const char *body;
- /* Increment count for freeing args in error case */
- tp->nr_args++;
-
+ ctx->tp = tp;
body = strchr(arg, '=');
if (body) {
if (body - arg > MAX_ARG_NAME_LEN) {
@@ -1438,7 +1605,8 @@ static int argv_has_var_arg(int argc, const char *argv[], int *args_idx,
if (str_has_prefix(argv[i], "$arg")) {
trace_probe_log_set_index(i + 2);
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(0, NOFENTRY_ARGS);
return -EINVAL;
}
@@ -1761,12 +1929,18 @@ void trace_probe_cleanup(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
+ if (tp->entry_arg) {
+ kfree(tp->entry_arg->code);
+ kfree(tp->entry_arg);
+ tp->entry_arg = NULL;
+ }
+
if (tp->event)
trace_probe_unlink(tp);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter)
+ const char *group, bool alloc_filter, int nargs)
{
struct trace_event_call *call;
size_t size = sizeof(struct trace_probe_event);
@@ -1802,6 +1976,11 @@ int trace_probe_init(struct trace_probe *tp, const char *event,
goto error;
}
+ tp->nr_args = nargs;
+ /* Make sure pointers in args[] are NULL */
+ if (nargs)
+ memset(tp->args, 0, sizeof(tp->args[0]) * nargs);
+
return 0;
error:
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index c1877d018269..cef3a50628a3 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -92,6 +92,7 @@ enum fetch_op {
FETCH_OP_ARG, /* Function argument : .param */
FETCH_OP_FOFFS, /* File offset: .immediate */
FETCH_OP_DATA, /* Allocated data: .data */
+ FETCH_OP_EDATA, /* Entry data: .offset */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
@@ -102,6 +103,7 @@ enum fetch_op {
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
+ FETCH_OP_ST_EDATA, /* Store Entry Data: .offset */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -232,6 +234,11 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
+struct probe_entry_arg {
+ struct fetch_insn *code;
+ unsigned int size; /* The entry data size */
+};
+
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
@@ -253,6 +260,7 @@ struct trace_probe {
struct trace_probe_event *event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
+ struct probe_entry_arg *entry_arg; /* This is only for return probe */
struct probe_arg args[];
};
@@ -338,7 +346,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter);
+ const char *group, bool alloc_filter, int nargs);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
@@ -355,6 +363,18 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
u8 *data, void *field);
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+int traceprobe_get_entry_data_size(struct trace_probe *tp);
+/* This is a runtime function to store entry data */
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs);
+#else /* !CONFIG_HAVE_FUNCTION_ARG_ACCESS_API */
+static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ return 0;
+}
+#define store_trace_entry_data(edata, tp, regs) do { } while (0)
+#endif
+
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
@@ -381,6 +401,11 @@ static inline bool tparg_is_function_entry(unsigned int flags)
return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
}
+static inline bool tparg_is_function_return(unsigned int flags)
+{
+ return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN);
+}
+
struct traceprobe_parse_context {
struct trace_event_call *event;
/* BTF related parameters */
@@ -392,6 +417,7 @@ struct traceprobe_parse_context {
const struct btf_type *last_type; /* Saved type */
u32 last_bitoffs; /* Saved bitoffs */
u32 last_bitsize; /* Saved bitsize */
+ struct trace_probe *tp;
unsigned int flags;
int offset;
};
@@ -506,7 +532,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTFARG, "This variable is not found at this probe point"),\
C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \
C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
- C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
+ C(NOFENTRY_ARGS, "$arg* can be used only on function entry or exit"), \
C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
C(ARGIDX_2BIG, "$argN index is too big"), \
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 3935b347f874..2caf0d2afb32 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
@@ -240,7 +240,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
for (i = 0; i < tp->nr_args; i++) {
arg = tp->args + i;
if (unlikely(arg->dynamic)) {
- len = process_fetch_insn(arg->code, regs, NULL, NULL);
+ len = process_fetch_insn(arg->code, regs, edata, NULL, NULL);
if (len > 0)
ret += len;
}
@@ -251,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, void *rec,
+store_trace_args(void *data, struct trace_probe *tp, void *rec, void *edata,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -266,7 +266,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, rec, dl, base);
+ ret = process_fetch_insn(arg->code, rec, edata, dl, base);
if (arg->dynamic && likely(ret > 0)) {
dyndata += ret;
maxlen -= ret;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c9ffdcfe622e..8a407adb0e1c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
+#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>
@@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void)
{
tracing_stop_sched_switch(RECORD_TGID);
}
+
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
+static int *tgid_map;
+
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
+#define SAVED_CMDLINES_DEFAULT 128
+#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char saved_cmdlines[];
+};
+static struct saved_cmdlines_buffer *savedcmd;
+
+/* Holds the size of a cmdline and pid element */
+#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \
+ (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
+
+static inline char *get_saved_cmdlines(int idx)
+{
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+}
+
+static inline void set_cmdline(int idx, const char *cmdline)
+{
+ strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+}
+
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ s->cmdline_num = val;
+
+ /* Place map_cmdline_to_pid array right after saved_cmdlines */
+ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
+
+ s->cmdline_idx = 0;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return s;
+}
+
+int trace_create_savedcmd(void)
+{
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
+
+ return savedcmd ? 0 : -ENOMEM;
+}
+
+int trace_save_cmdline(struct task_struct *tsk)
+{
+ unsigned tpid, idx;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
+
+ /*
+ * It's not the end of the world if we don't get
+ * the lock, but we also don't want to spin
+ * nor do we want to disable interrupts,
+ * so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better be disabled and the run queue lock held.
+ */
+ lockdep_assert_preemption_disabled();
+ if (!arch_spin_trylock(&trace_cmdline_lock))
+ return 0;
+
+ idx = savedcmd->map_pid_to_cmdline[tpid];
+ if (idx == NO_CMDLINE_MAP) {
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
+
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
+ savedcmd->cmdline_idx = idx;
+ }
+
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ set_cmdline(idx, tsk->comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
+}
+
+static void __trace_find_cmdline(int pid, char comm[])
+{
+ unsigned map;
+ int tpid;
+
+ if (!pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
+
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
+ }
+ strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release() in trace_alloc_tgid_map() to
+ * ensure that if we observe a non-NULL tgid_map then we also observe
+ * the correct tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
+int trace_find_tgid(int pid)
+{
+ int *ptr = trace_find_tgid_ptr(pid);
+
+ return ptr ? *ptr : 0;
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
+{
+ int *ptr;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
+ return 0;
+
+ *ptr = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task: task to record
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev: previous task during sched_switch
+ * @next: next task during sched_switch
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+ done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
+}
+
+int trace_alloc_tgid_map(void)
+{
+ int *map;
+
+ if (tgid_map)
+ return 0;
+
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ return 0;
+}
+
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int pid = ++(*pos);
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+ int pid = *pos;
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
+
+ seq_printf(m, "%d %d\n", pid, tgid);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+ .start = saved_tgids_start,
+ .stop = saved_tgids_stop,
+ .next = saved_tgids_next,
+ .show = saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_saved_tgids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ unsigned int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
+}
+
+const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static ssize_t
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+void trace_free_saved_cmdlines_buffer(void)
+{
+ free_saved_cmdlines_buffer(savedcmd);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
+
+ s = allocate_cmdlines_buffer(val);
+ if (!s)
+ return -ENOMEM;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ free_saved_cmdlines_buffer(savedcmd_temp);
+
+ return 0;
+}
+
+static ssize_t
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* must have at least 1 entry and at most PID_MAX_DEFAULT entries */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
+
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
+};
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 529590499b1f..e9c5058a8efd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -768,7 +768,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops) {
+ if (ftrace_dump_on_oops_enabled()) {
ftrace_dump(DUMP_ALL);
/* ftrace_dump() disables tracing */
tracing_on();
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..9e461362450a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -211,8 +211,8 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -337,7 +337,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group, true);
+ ret = trace_probe_init(&tu->tp, event, group, true, nargs);
if (ret < 0)
goto error;
@@ -1490,11 +1490,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
+ dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
ret |= uprobe_trace_func(tu, regs, ucb, dsize);
@@ -1525,11 +1525,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
+ dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
uretprobe_trace_func(tu, func, regs, ucb, dsize);
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4aa6166cb856..d9e283600f5c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -119,7 +119,7 @@ bool setup_userns_sysctls(struct user_namespace *ns)
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = ns->sysctls->ctl_table_arg;
unregister_sysctl_table(ns->sysctls);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ce4d99df5f0e..0b0b95418b16 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
- char *kbuf = NULL, *pos, *next_line;
+ char *kbuf, *pos, *next_line;
ssize_t ret;
/* Only allow < page size writes at the beginning of the file */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
new file mode 100644
index 000000000000..23c125c2e243
--- /dev/null
+++ b/kernel/vmcore_info.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmcore_info.c - kernel crash support code (vmcoreinfo note handling).
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/*
+ * vmcoreinfo state.
+ *
+ * vmcoreinfo_data: text buffer that vmcoreinfo_append_str() appends to
+ * vmcoreinfo_size: number of bytes of vmcoreinfo_data currently in use
+ * vmcoreinfo_note: buffer in which update_vmcoreinfo_note() builds the note
+ */
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
+u32 *vmcoreinfo_note;
+
+/*
+ * trusted vmcoreinfo, e.g. we can make a copy in the crash memory.
+ * Set by crash_update_vmcoreinfo_safecopy(); when non-NULL,
+ * crash_save_vmcoreinfo() switches vmcoreinfo_data over to it.
+ */
+static unsigned char *vmcoreinfo_data_safecopy;
+
+/*
+ * Write one ELF note at @buf: the struct elf_note header, then @name
+ * (including its NUL terminator), then @data_len bytes of @data. Each of
+ * the three parts advances the cursor by a whole number of Elf_Word units.
+ *
+ * Returns the position just past the note, i.e. where the next note or the
+ * terminator written by final_note() goes.
+ */
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+			  void *data, size_t data_len)
+{
+	struct elf_note *hdr = (struct elf_note *)buf;
+	Elf_Word *name_pos = buf + DIV_ROUND_UP(sizeof(*hdr), sizeof(Elf_Word));
+	Elf_Word *desc_pos;
+
+	hdr->n_namesz = strlen(name) + 1;
+	hdr->n_descsz = data_len;
+	hdr->n_type = type;
+
+	memcpy(name_pos, name, hdr->n_namesz);
+	desc_pos = name_pos + DIV_ROUND_UP(hdr->n_namesz, sizeof(Elf_Word));
+
+	memcpy(desc_pos, data, data_len);
+	return desc_pos + DIV_ROUND_UP(data_len, sizeof(Elf_Word));
+}
+
+/*
+ * Write an all-zero note header at @buf: sizeof(struct elf_note) bytes are
+ * cleared. update_vmcoreinfo_note() calls this right after the last
+ * append_elf_note().
+ */
+void final_note(Elf_Word *buf)
+{
+ memset(buf, 0, sizeof(struct elf_note));
+}
+
+/*
+ * Rebuild the note in vmcoreinfo_note from the current contents of
+ * vmcoreinfo_data. Nothing is written while vmcoreinfo_data is still empty.
+ */
+static void update_vmcoreinfo_note(void)
+{
+	u32 *end;
+
+	if (vmcoreinfo_size == 0)
+		return;
+
+	end = append_elf_note(vmcoreinfo_note, VMCOREINFO_NOTE_NAME, 0,
+			      vmcoreinfo_data, vmcoreinfo_size);
+	final_note(end);
+}
+
+/*
+ * Register @ptr as the safe copy of the vmcoreinfo data.
+ *
+ * A non-NULL @ptr is first filled with the vmcoreinfo_size bytes currently
+ * in vmcoreinfo_data. Passing NULL just clears the registration.
+ * NOTE(review): @ptr presumably has to hold at least vmcoreinfo_size
+ * bytes; nothing here checks it -- confirm against callers.
+ */
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+ if (ptr)
+ memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+ vmcoreinfo_data_safecopy = ptr;
+}
+
+/*
+ * Append a CRASHTIME entry and regenerate the vmcoreinfo note.
+ *
+ * Does nothing when vmcoreinfo_note is NULL, which is how
+ * crash_save_vmcoreinfo_init() leaves it if either allocation failed.
+ */
+void crash_save_vmcoreinfo(void)
+{
+ if (!vmcoreinfo_note)
+ return;
+
+ /* Use the safe copy to generate vmcoreinfo note if have */
+ if (vmcoreinfo_data_safecopy)
+ vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
+ /* The switch above must come first: the append writes to vmcoreinfo_data. */
+ vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
+ update_vmcoreinfo_note();
+}
+
+/*
+ * Format one entry and append it to vmcoreinfo_data.
+ *
+ * The text is formatted into a 0x50-byte stack buffer first, then as much of
+ * it as still fits below VMCOREINFO_BYTES is copied in. A one-time warning
+ * is raised once the data area is completely full.
+ */
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	char line[0x50];
+	va_list ap;
+	size_t len, room;
+
+	va_start(ap, fmt);
+	len = vscnprintf(line, sizeof(line), fmt, ap);
+	va_end(ap);
+
+	/* Clip to whatever space is left in the data area. */
+	room = (size_t)VMCOREINFO_BYTES - vmcoreinfo_size;
+	if (len > room)
+		len = room;
+
+	memcpy(vmcoreinfo_data + vmcoreinfo_size, line, len);
+	vmcoreinfo_size += len;
+
+	WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+		  "vmcoreinfo data exceeds allocated size, truncating");
+}
+
+/*
+ * Provide an empty default implementation here -- architecture
+ * code may override this weak symbol. crash_save_vmcoreinfo_init()
+ * calls it after recording the generic entries and before building
+ * the note.
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+/*
+ * Return the physical address of the vmcoreinfo note buffer, obtained with
+ * __pa(). Declared __weak, so another definition may replace this one.
+ */
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa(vmcoreinfo_note);
+}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+
+/*
+ * Allocate the vmcoreinfo buffers and fill in the generic entries.
+ *
+ * vmcoreinfo_data gets one zeroed page and vmcoreinfo_note gets
+ * VMCOREINFO_NOTE_SIZE zeroed bytes. The VMCOREINFO_*() macros and
+ * vmcoreinfo_append_str() then record symbols, sizes, offsets and constants,
+ * the architecture hook adds its own entries, and the note is built.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails, in which case
+ * both vmcoreinfo_data and vmcoreinfo_note are left NULL.
+ */
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ if (!vmcoreinfo_data) {
+ pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+ return -ENOMEM;
+ }
+
+ vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!vmcoreinfo_note) {
+ /* Release the data page too, so no half-initialised state remains. */
+ free_page((unsigned long)vmcoreinfo_data);
+ vmcoreinfo_data = NULL;
+ pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+ return -ENOMEM;
+ }
+
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_BUILD_ID();
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_OFFSET(uts_namespace, name);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START);
+
+#ifndef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ VMCOREINFO_SYMBOL_ARRAY(vmemmap);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+ VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
+ VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _refcount);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_head);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLATMEM
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
+ log_buf_vmcoreinfo_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_swapbacked);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb)
+ VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
+
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
+ /* Let the architecture add its own entries, then build the note. */
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 81a8862295d6..d12ff74889ed 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -12,20 +12,25 @@
#define pr_fmt(fmt) "watchdog: " fmt
-#include <linux/mm.h>
#include <linux/cpu.h>
-#include <linux/nmi.h>
#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+#include <linux/kvm_para.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
-#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
-#include <linux/kvm_para.h>
static DEFINE_MUTEX(watchdog_mutex);
@@ -35,6 +40,8 @@ static DEFINE_MUTEX(watchdog_mutex);
# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
+#define NUM_SAMPLE_PERIODS 5
+
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
@@ -333,6 +340,188 @@ __setup("watchdog_thresh=", watchdog_thresh_setup);
static void __lockup_detector_cleanup(void);
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
+/* Index of each tracked statistic within one utilization sample. */
+enum stats_per_group {
+ STATS_SYSTEM,
+ STATS_SOFTIRQ,
+ STATS_HARDIRQ,
+ STATS_IDLE,
+ NUM_STATS_PER_GROUP,
+};
+
+/* cpustat[] slot read for each stats_per_group index, in the same order. */
+static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
+ CPUTIME_SYSTEM,
+ CPUTIME_SOFTIRQ,
+ CPUTIME_IRQ,
+ CPUTIME_IDLE,
+};
+
+/*
+ * Per-CPU sampling state maintained by update_cpustat():
+ * cpustat_old:  previous reading of each tracked stat, in 2^24ns units
+ * cpustat_util: ring of the last NUM_SAMPLE_PERIODS utilization samples (%)
+ * cpustat_tail: ring slot that the next sample will be written to
+ */
+static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_tail);
+
+/*
+ * Nanosecond resolution is more than we need here. Dropping the low 24 bits
+ * leaves a unit of 2^24ns (~= 16 * 10^6 ns), which is precise enough and
+ * lets the truncated value be stored in a u16 that rolls over roughly
+ * every ~1000 seconds.
+ */
+static u16 get_16bit_precision(u64 data_ns)
+{
+	u64 units = data_ns >> 24;	/* 2^24ns ~= 16.8ms */
+
+	return units;
+}
+
+/*
+ * Record this CPU's utilization for the sample period that just ended.
+ *
+ * For every tracked statistic, the time accumulated since the previous call
+ * is expressed as a percentage of sample_period and stored in the
+ * cpustat_util slot at cpustat_tail. The tail then advances around the
+ * NUM_SAMPLE_PERIODS-entry ring.
+ */
+static void update_cpustat(void)
+{
+	int i;
+	u8 util;
+	u16 old_stat, new_stat, delta;
+	struct kernel_cpustat kcpustat;
+	u64 *cpustat = kcpustat.cpustat;
+	u8 tail = __this_cpu_read(cpustat_tail);
+	u16 sample_period_16 = get_16bit_precision(sample_period);
+
+	kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+
+	for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
+		old_stat = __this_cpu_read(cpustat_old[i]);
+		new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
+		/*
+		 * The 16-bit readings roll over. Subtracting them directly
+		 * would promote both to int and yield a negative delta after
+		 * a rollover, so take the difference modulo 2^16 instead.
+		 */
+		delta = new_stat - old_stat;
+		util = DIV_ROUND_UP(100 * delta, sample_period_16);
+		__this_cpu_write(cpustat_util[tail][i], util);
+		__this_cpu_write(cpustat_old[i], new_stat);
+	}
+
+	__this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
+}
+
+/*
+ * Print the NUM_SAMPLE_PERIODS most recent utilization samples of this CPU,
+ * oldest first.
+ *
+ * The sample period is reported in milliseconds: it is a fraction of a
+ * second for small watchdog thresholds, which whole seconds would print
+ * as 0.
+ */
+static void print_cpustat(void)
+{
+	int i, group;
+	u8 tail = __this_cpu_read(cpustat_tail);
+	u64 sample_period_msecond = sample_period;
+
+	do_div(sample_period_msecond, NSEC_PER_MSEC);
+
+	/*
+	 * Outputting the "watchdog" prefix on every line is redundant and not
+	 * concise, and the original alarm information is sufficient for
+	 * positioning in logs, hence here printk() is used instead of pr_crit().
+	 */
+	printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
+	       smp_processor_id(), sample_period_msecond);
+
+	/* cpustat_tail is the next slot to be overwritten, i.e. the oldest. */
+	for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
+		group = (tail + i) % NUM_SAMPLE_PERIODS;
+		printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
+			"%3u%% hardirq,\t%3u%% idle\n", i + 1,
+			__this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
+			__this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
+			__this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
+			__this_cpu_read(cpustat_util[group][STATS_IDLE]));
+	}
+}
+
+/* Hardirq utilization (%) above which need_counting_irqs() returns true. */
+#define HARDIRQ_PERCENT_THRESH 50
+/* Number of interrupts listed by print_irq_counts(). */
+#define NUM_HARDIRQ_REPORT 5
+/* One entry of the "most frequent interrupts" table. */
+struct irq_counts {
+ int irq; /* interrupt number; print_irq_counts() uses -1 for an unused slot */
+ u32 counts; /* value returned by kstat_get_irq_since_snapshot() for this irq */
+};
+
+/* Set by start_counting_irqs(), cleared by stop_counting_irqs(). */
+static DEFINE_PER_CPU(bool, snapshot_taken);
+
+/*
+ * Tabulate the most frequent interrupts.
+ *
+ * @irq_counts holds @rank entries. From the first entry whose count is
+ * below @counts onwards, the new entry is swapped in and the displaced
+ * entry is carried along to the next slot; whatever is still being carried
+ * after the last slot is dropped.
+ */
+static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
+{
+	struct irq_counts carry = { .irq = irq, .counts = counts };
+	int pos;
+
+	for (pos = 0; pos < rank; pos++) {
+		if (irq_counts[pos].counts >= counts)
+			continue;
+
+		swap(carry, irq_counts[pos]);
+	}
+}
+
+/*
+ * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
+ * then the cause of softlockup might be interrupt storm. In this case, it
+ * would be useful to start interrupt counting.
+ *
+ * Returns true when the most recent utilization sample of this CPU shows
+ * hardirq time above the threshold.
+ */
+static bool need_counting_irqs(void)
+{
+	u8 util;
+	int tail = __this_cpu_read(cpustat_tail);
+
+	/*
+	 * cpustat_tail is the slot update_cpustat() writes next, so the latest
+	 * sample sits one slot behind it. The ring holds NUM_SAMPLE_PERIODS
+	 * entries, so that is the modulus to wrap with.
+	 */
+	tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
+	util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
+	return util > HARDIRQ_PERCENT_THRESH;
+}
+
+/*
+ * Take an interrupt-count snapshot for this CPU unless one is already
+ * active, and remember that it has been taken.
+ */
+static void start_counting_irqs(void)
+{
+	if (__this_cpu_read(snapshot_taken))
+		return;
+
+	kstat_snapshot_irqs();
+	__this_cpu_write(snapshot_taken, true);
+}
+
+/*
+ * Mark this CPU's snapshot as no longer active, so the next
+ * start_counting_irqs() takes a fresh one.
+ */
+static void stop_counting_irqs(void)
+{
+ __this_cpu_write(snapshot_taken, false);
+}
+
+/*
+ * Print the NUM_HARDIRQ_REPORT interrupts with the highest counts since the
+ * snapshot taken by start_counting_irqs(). Does nothing when no snapshot is
+ * active on this CPU.
+ */
+static void print_irq_counts(void)
+{
+	unsigned int i, count;
+	/*
+	 * Every slot starts out as the {-1, 0} "unused" sentinel. The range
+	 * initializer keeps that true whatever NUM_HARDIRQ_REPORT is set to.
+	 */
+	struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
+		[0 ... NUM_HARDIRQ_REPORT - 1] = {-1, 0}
+	};
+
+	if (!__this_cpu_read(snapshot_taken))
+		return;
+
+	for_each_active_irq(i) {
+		count = kstat_get_irq_since_snapshot(i);
+		tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
+	}
+
+	/*
+	 * Outputting the "watchdog" prefix on every line is redundant and not
+	 * concise, and the original alarm information is sufficient for
+	 * positioning in logs, hence here printk() is used instead of pr_crit().
+	 */
+	printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
+	       smp_processor_id(), HARDIRQ_PERCENT_THRESH);
+
+	for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
+		/* The first unused slot ends the list. */
+		if (irq_counts_sorted[i].irq == -1)
+			break;
+
+		printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
+		       i + 1, irq_counts_sorted[i].counts,
+		       irq_counts_sorted[i].irq);
+	}
+
+	/*
+	 * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
+	 * sample_period, then we suspect the interrupt storm might be subsiding.
+	 */
+	if (!need_counting_irqs())
+		stop_counting_irqs();
+}
+
+/*
+ * Print the recorded utilization samples of this CPU, followed by the most
+ * frequent interrupts when an interrupt snapshot is active.
+ */
+static void report_cpu_status(void)
+{
+ print_cpustat();
+ print_irq_counts();
+}
+#else
+/* No-op stand-ins used when CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM is not set. */
+static inline void update_cpustat(void) { }
+static inline void report_cpu_status(void) { }
+static inline bool need_counting_irqs(void) { return false; }
+static inline void start_counting_irqs(void) { }
+static inline void stop_counting_irqs(void) { }
+#endif
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -364,7 +553,7 @@ static void set_sample_period(void)
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
- sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
watchdog_update_hrtimer_threshold(sample_period);
}
@@ -434,6 +623,18 @@ static int is_softlockup(unsigned long touch_ts,
unsigned long now)
{
if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ /*
+ * If period_ts has not been updated during a sample_period, then
+ * in the subsequent few sample_periods, period_ts might also not
+ * be updated, which could indicate a potential softlockup. In
+ * this case, if we suspect the cause of the potential softlockup
+ * might be interrupt storm, then we need to count the interrupts
+ * to find which interrupt is storming.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
+ need_counting_irqs())
+ start_counting_irqs();
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -456,6 +657,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
static int softlockup_fn(void *data)
{
update_touch_ts();
+ stop_counting_irqs();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -504,6 +706,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+ update_cpustat();
+
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -539,6 +743,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ report_cpu_status();
print_modules();
print_irqtrace_events(current);
if (regs)
@@ -796,8 +1001,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
/*
* /proc/sys/kernel/watchdog
*/
-int proc_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED |
WATCHDOG_SOFTOCKUP_ENABLED,
@@ -807,8 +1012,8 @@ int proc_watchdog(struct ctl_table *table, int write,
/*
* /proc/sys/kernel/nmi_watchdog
*/
-int proc_nmi_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!watchdog_hardlockup_available && write)
return -ENOTSUPP;
@@ -816,21 +1021,23 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
table, write, buffer, lenp, ppos);
}
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
/*
* /proc/sys/kernel/soft_watchdog
*/
-int proc_soft_watchdog(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_soft_watchdog(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
table, write, buffer, lenp, ppos);
}
+#endif
/*
* /proc/sys/kernel/watchdog_thresh
*/
-int proc_watchdog_thresh(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -852,8 +1059,8 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* user to specify a mask that will include cpus that have not yet
* been brought online, if desired.
*/
-int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_watchdog_cpumask(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bf2bdac46843..80882ae43261 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1277,8 +1277,12 @@ static bool kick_pool(struct worker_pool *pool)
!cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
struct work_struct *work = list_first_entry(&pool->worklist,
struct work_struct, entry);
- p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
- get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
+ cpu_online_mask);
+ if (wake_cpu < nr_cpu_ids) {
+ p->wake_cpu = wake_cpu;
+ get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+ }
}
#endif
wake_up_process(p);
@@ -1464,7 +1468,7 @@ void wq_worker_sleeping(struct task_struct *task)
* wq_worker_tick - a scheduler tick occurred while a kworker is running
* @task: task currently running
*
- * Called from scheduler_tick(). We're in the IRQ context and the current
+ * Called from sched_tick(). We're in the IRQ context and the current
* worker's fields which follow the 'K' locking rule can be accessed safely.
*/
void wq_worker_tick(struct task_struct *task)
@@ -1594,6 +1598,15 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
if (off_cpu >= 0)
total_cpus--;
+ /* If all CPUs of the wq get offline, use the default values */
+ if (unlikely(!total_cpus)) {
+ for_each_node(node)
+ wq_node_nr_active(wq, node)->max = min_active;
+
+ wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
+ return;
+ }
+
for_each_node(node) {
int node_cpus;
@@ -1606,7 +1619,7 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
min_active, max_active);
}
- wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+ wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
}
/**
@@ -7080,7 +7093,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
__ATTR_NULL,
};
-static struct bus_type wq_subsys = {
+static const struct bus_type wq_subsys = {
.name = "workqueue",
.dev_groups = wq_sysfs_groups,
};