Diffstat (limited to 'kernel')
231 files changed, 18622 insertions, 7122 deletions
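The largest self-contained addition in this diff is the new BPF cgroup local storage map (kernel/bpf/bpf_cgrp_storage.c), which introduces the cgrp_storage_map_ops map type plus the bpf_cgrp_storage_get()/bpf_cgrp_storage_delete() helpers. As a minimal, illustrative sketch (not part of the diff itself), a BPF program could exercise the new map roughly as follows; the map name, program name, and attach point are assumptions in the style of the selftests, not something this diff defines:

// Illustrative BPF-C sketch: per-cgroup counter using the new
// BPF_MAP_TYPE_CGRP_STORAGE map added by kernel/bpf/bpf_cgrp_storage.c.
// Map/program names and the tracepoint chosen here are hypothetical.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for local storage maps */
	__type(key, int);			/* keyed by cgroup fd from the syscall side */
	__type(value, long);
} cgrp_counter SEC(".maps");

SEC("tp_btf/sys_enter")
int BPF_PROG(count_syscalls)
{
	struct task_struct *task = bpf_get_current_task_btf();
	long *cnt;

	/* Look up, or create on first use, storage attached to the
	 * current task's cgroup; mirrors BPF_LOCAL_STORAGE_GET_F_CREATE
	 * handling in bpf_cgrp_storage_get() above.
	 */
	cnt = bpf_cgrp_storage_get(&cgrp_counter, task->cgroups->dfl_cgrp, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";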
diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..10ef068f598d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,11 +38,9 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# Don't instrument error handlers -CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI) - obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -68,6 +66,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o diff --git a/kernel/acct.c b/kernel/acct.c index 13706356ec54..010667ce6080 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -320,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns) } /* - * encode an unsigned long into a comp_t + * encode an u64 into a comp_t * * This routine has been adopted from the encode_comp_t() function in * the kern_acct.c file of the FreeBSD operating system. The encoding @@ -331,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns) #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ -static comp_t encode_comp_t(unsigned long value) +static comp_t encode_comp_t(u64 value) { int exp, rnd; @@ -350,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value) exp++; } + if (exp > (((comp_t) ~0U) >> MANTSIZE)) + return (comp_t) ~0U; /* * Clean it up and polish it off. */ @@ -555,15 +557,14 @@ void acct_collect(long exitcode, int group_dead) unsigned long vsize = 0; if (group_dead && current->mm) { + struct mm_struct *mm = current->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - mmap_read_lock(current->mm); - vma = current->mm->mmap; - while (vma) { + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/audit.c b/kernel/audit.c index a75978ae38ad..9bc0b0301198 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -321,7 +321,6 @@ static inline int audit_rate_check(void) static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; - unsigned long elapsed; int retval = 0; if (!audit_rate_limit) return 1; @@ -330,9 +329,8 @@ static inline int audit_rate_check(void) if (++messages < audit_rate_limit) { retval = 1; } else { - now = jiffies; - elapsed = now - last_check; - if (elapsed > HZ) { + now = jiffies; + if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; @@ -366,7 +364,7 @@ void audit_log_lost(const char *message) if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; - if (now - last_msg > HZ) { + if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } diff --git a/kernel/audit.h b/kernel/audit.h index 58b66543b4d5..c57b008b9914 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -133,7 +133,7 @@ struct audit_context { struct sockaddr_storage *sockaddr; size_t sockaddr_len; /* Save things to print about task_struct */ - pid_t pid, ppid; + pid_t ppid; kuid_t uid, euid, suid, fsuid; kgid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -245,8 +245,6 @@ struct audit_netlink_list { int audit_send_list_thread(void *_dest); -extern int 
selinux_audit_rule_update(void); - extern struct mutex audit_filter_mutex; extern int audit_del_rule(struct audit_entry *entry); extern void audit_free_rule_rcu(struct rcu_head *head); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4b0957aa2cd4..65075f1e4ac8 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct path *path) +static struct audit_parent *audit_init_parent(const struct path *path) { struct inode *inode = d_backing_inode(path->dentry); struct audit_parent *parent; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 79a5da1bc5bb..547c88be8a28 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -806,30 +806,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val) } /** - * audit_filter_uring - apply filters to an io_uring operation + * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context + * @list: audit filter list + * @name: audit_name (can be NULL) + * @op: current syscall/uring_op + * + * Run the udit filters specified in @list against @tsk using @ctx, + * @name, and @op, as necessary; the caller is responsible for ensuring + * that the call is made while the RCU read lock is held. The @name + * parameter can be NULL, but all others must be specified. + * Returns 1/true if the filter finds a match, 0/false if none are found. */ -static void audit_filter_uring(struct task_struct *tsk, - struct audit_context *ctx) +static int __audit_filter_op(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list, + struct audit_names *name, + unsigned long op) { struct audit_entry *e; enum audit_state state; + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, op) && + audit_filter_rules(tsk, &e->rule, ctx, name, + &state, false)) { + ctx->current_state = state; + return 1; + } + } + return 0; +} + +/** + * audit_filter_uring - apply filters to an io_uring operation + * @tsk: associated task + * @ctx: audit context + */ +static void audit_filter_uring(struct task_struct *tsk, + struct audit_context *ctx) +{ if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT], - list) { - if (audit_in_mask(&e->rule, ctx->uring_op) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, - false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], + NULL, ctx->uring_op); rcu_read_unlock(); } @@ -841,24 +864,13 @@ static void audit_filter_uring(struct task_struct *tsk, static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { - struct audit_entry *e; - enum audit_state state; - if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], + NULL, ctx->major); rcu_read_unlock(); - return; } /* @@ -870,17 +882,8 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct 
list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - return 0; + return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been @@ -965,7 +968,7 @@ static void audit_reset_context(struct audit_context *ctx) if (!ctx) return; - /* if ctx is non-null, reset the "ctx->state" regardless */ + /* if ctx is non-null, reset the "ctx->context" regardless */ ctx->context = AUDIT_CTX_UNUSED; if (ctx->dummy) return; @@ -1002,7 +1005,7 @@ static void audit_reset_context(struct audit_context *ctx) kfree(ctx->sockaddr); ctx->sockaddr = NULL; ctx->sockaddr_len = 0; - ctx->pid = ctx->ppid = 0; + ctx->ppid = 0; ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0); ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0); ctx->personality = 0; @@ -1016,7 +1019,6 @@ static void audit_reset_context(struct audit_context *ctx) WARN_ON(!list_empty(&ctx->killed_trees)); audit_free_module(ctx); ctx->fds[0] = -1; - audit_proctitle_free(ctx); ctx->type = 0; /* reset last for audit_free_*() */ } @@ -1077,6 +1079,7 @@ static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ audit_reset_context(context); + audit_proctitle_free(context); free_tree_refs(context); kfree(context->filterkey); kfree(context); @@ -1833,7 +1836,7 @@ void __audit_free(struct task_struct *tsk) /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a - * random task_struct that doesn't doesn't have any meaningful data we + * random task_struct that doesn't have any meaningful data we * need to log via audit_log_exit(). 
*/ if (tsk == current && !context->dummy) { @@ -2069,7 +2072,7 @@ void __audit_syscall_exit(int success, long return_code) /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); - if (context->current_state < AUDIT_STATE_RECORD) + if (context->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..b529182e8b04 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,13 @@ int main(void) DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); +#ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); +#else + DEFINE(LRU_GEN_WIDTH, 0); + DEFINE(__LRU_REFS_WIDTH, 0); +#endif /* End of constants */ return 0; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..3a12e6b400a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 624527401d4d..484706959556 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); @@ -305,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - if (map_value_has_kptrs(&arr->map)) - bpf_map_free_kptrs(&arr->map, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -334,12 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), - value, map->value_size); + val = this_cpu_ptr(array->pptrs[index & array->index_mask]); + copy_map_value(map, val, value); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -347,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, 
value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -383,7 +377,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -406,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(map)) + /* We don't reset or free fields other than timer on uref dropping to zero. */ + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -420,10 +415,21 @@ static void array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (map_value_has_kptrs(map)) { - for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); - bpf_map_free_kptr_off_tab(map); + if (!IS_ERR_OR_NULL(map->record)) { + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + for (i = 0; i < array->map.max_entries; i++) { + void __percpu *pptr = array->pptrs[i & array->index_mask]; + int cpu; + + for_each_possible_cpu(cpu) { + bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } + } else { + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); + } } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) @@ -608,9 +614,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) pptr = v; size = array->elem_size; for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b9ea539a5561..48ee750849f2 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) attr->value_size / sizeof(u32); if (!(attr->map_flags & BPF_F_ZERO_SEED)) - bloom->hash_seed = get_random_int(); + bloom->hash_seed = get_random_u32(); return &bloom->map; } diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c new file mode 100644 index 000000000000..6cdf6d9ed91d --- /dev/null +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ */ + +#include <linux/types.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <uapi/linux/btf.h> +#include <linux/btf_ids.h> + +DEFINE_BPF_STORAGE_CACHE(cgroup_cache); + +static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); + +static void bpf_cgrp_storage_lock(void) +{ + migrate_disable(); + this_cpu_inc(bpf_cgrp_storage_busy); +} + +static void bpf_cgrp_storage_unlock(void) +{ + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); +} + +static bool bpf_cgrp_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); + return false; + } + return true; +} + +static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) +{ + struct cgroup *cg = owner; + + return &cg->bpf_cgrp_storage; +} + +void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ + struct bpf_local_storage *local_storage; + bool free_cgroup_storage = false; + unsigned long flags; + + rcu_read_lock(); + local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + bpf_cgrp_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); + free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_cgrp_storage_unlock(); + rcu_read_unlock(); + + if (free_cgroup_storage) + kfree_rcu(local_storage, rcu); +} + +static struct bpf_local_storage_data * +cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *cgroup_storage; + struct bpf_local_storage_map *smap; + + cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, + bpf_rcu_lock_held()); + if (!cgroup_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); +} + +static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return ERR_CAST(cgroup); + + bpf_cgrp_storage_lock(); + sdata = cgroup_storage_lookup(cgroup, map, true); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return sdata ? 
sdata->data : NULL; +} + +static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, map_flags, GFP_ATOMIC); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return PTR_ERR_OR_ZERO(sdata); +} + +static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = cgroup_storage_lookup(cgroup, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), true); + return 0; +} + +static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct cgroup *cgroup; + int err, fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + err = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return err; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + return bpf_local_storage_map_alloc(attr, &cgroup_cache); +} + +static void cgroup_storage_map_free(struct bpf_map *map) +{ + bpf_local_storage_map_free(map, &cgroup_cache, NULL); +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + if (!cgroup) + return (unsigned long)NULL; + + if (!bpf_cgrp_storage_trylock()) + return (unsigned long)NULL; + + sdata = cgroup_storage_lookup(cgroup, map, true); + if (sdata) + goto unlock; + + /* only allocate new storage, when the cgroup is refcounted */ + if (!percpu_ref_is_dying(&cgroup->self.refcnt) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, BPF_NOEXIST, gfp_flags); + +unlock: + bpf_cgrp_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; +} + +BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!cgroup) + return -EINVAL; + + if (!bpf_cgrp_storage_trylock()) + return -EBUSY; + + ret = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + return ret; +} + +const struct bpf_map_ops cgrp_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_cgrp_storage_lookup_elem, + .map_update_elem = bpf_cgrp_storage_update_elem, + .map_delete_elem = bpf_cgrp_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &bpf_local_storage_map_btf_id[0], + .map_owner_storage_ptr = cgroup_storage_ptr, +}; + +const struct bpf_func_proto bpf_cgrp_storage_get_proto = { + .func = bpf_cgrp_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { + .func = bpf_cgrp_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], +}; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 5f7683b19199..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_inode_storage = false; struct bpf_storage_blob *bsb; - struct hlist_node *n; bsb = bpf_inode(inode); if (!bsb) @@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&local_storage->lock); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); - /* free_inoode_storage should always be true as long as - * local_storage->list was non-empty. 
- */ if (free_inode_storage) kfree_rcu(local_storage, rcu); } @@ -226,27 +205,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &inode_cache); } static void inode_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -257,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 24b755eca0b3..5dc307bdeaeb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -202,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } stop: offs = seq->count; + if (IS_ERR(p)) { + seq->op->stop(seq, NULL); + err = PTR_ERR(p); + goto done; + } /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { @@ -689,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { + struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->aux->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 8ce40fd869f6..b39a46e8fb08 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -74,7 +74,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, gfp_flags | __GFP_NOWARN); if (selem) { if (value) - memcpy(SDATA(selem)->data, value, smap->map.value_size); + copy_map_value(&smap->map, SDATA(selem)->data, value); return selem; } @@ -88,8 +88,14 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; + /* If RCU Tasks Trace grace period implies RCU grace period, do + * kfree(), else do kfree_rcu(). 
+ */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - kfree_rcu(local_storage, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(local_storage); + else + kfree_rcu(local_storage, rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) @@ -97,16 +103,19 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - kfree_rcu(selem, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(selem); + else + kfree_rcu(selem, rcu); } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) +static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -233,6 +242,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) __bpf_selem_unlink_storage(selem, use_trace_rcu); } +/* If cacheit_lockit is false, this lookup function is lockless */ struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, @@ -372,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) @@ -491,7 +501,7 @@ unlock_err: return ERR_PTR(err); } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; @@ -515,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) return res; } -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx) +static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, - int __percpu *busy_counter) -{ - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map_bucket *b; - unsigned int i; - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the owner->storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or when the storage is freed e.g. - * by bpf_sk_storage_free() during __sk_destruct(). 
- */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - if (busy_counter) { - migrate_disable(); - __this_cpu_inc(*busy_counter); - } - bpf_selem_unlink(selem, false); - if (busy_counter) { - __this_cpu_dec(*busy_counter); - migrate_enable(); - } - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* While freeing the storage we may still need to access the map. - * - * e.g. when bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exit immediately. - * - * However, while freeing the storage one still needs to access the - * smap->elem_size to do the uncharging in - * bpf_selem_unlink_storage_nolock(). - * - * Hence, wait another rcu grace period for the storage to be freed. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - kfree(smap); -} - int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || @@ -604,13 +552,13 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -623,7 +571,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - kfree(smap); + bpf_map_area_free(smap); return ERR_PTR(-ENOMEM); } @@ -654,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } + +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +{ + struct bpf_local_storage_elem *selem; + bool free_storage = false; + struct hlist_node *n; + + /* Neither the bpf_prog nor the bpf_map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added to or deleted from the + * local_storage->list by the bpf_prog or by the bpf_map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + /* If local_storage list has only one element, the + * bpf_selem_unlink_storage_nolock() will return true. + * Otherwise, it will return false. The current loop iteration + * intends to remove all local storage. So the last iteration + * of the loop will set the free_cgroup_storage to true. 
+ */ + free_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false, false); + } + + return free_storage; +} + +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache) +{ + struct bpf_local_storage_map *smap; + + smap = __bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); + return &smap->map; +} + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, + int __percpu *busy_counter) +{ + struct bpf_local_storage_map_bucket *b; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + unsigned int i; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(cache, smap->cache_idx); + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + this_cpu_inc(*busy_counter); + } + bpf_selem_unlink(selem, false); + if (busy_counter) { + this_cpu_dec(*busy_counter); + migrate_enable(); + } + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + bpf_map_area_free(smap); +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..9ea42a45da47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -41,17 +41,21 @@ BTF_SET_END(bpf_lsm_hooks) */ BTF_SET_START(bpf_lsm_current_hooks) /* operate on freshly allocated sk without any cgroup association */ +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif BTF_SET_END(bpf_lsm_current_hooks) /* List of LSM hooks that trigger while the socket is properly locked. */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) +#endif BTF_SET_END(bpf_lsm_locked_sockopt_hooks) /* List of LSM hooks that trigger while the socket is _not_ locked, @@ -59,8 +63,10 @@ BTF_SET_END(bpf_lsm_locked_sockopt_hooks) * in the early init phase. 
*/ BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_post_create) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) #ifdef CONFIG_CGROUP_BPF @@ -145,6 +151,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .func = bpf_ima_inode_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], @@ -163,6 +170,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) static const struct bpf_func_proto bpf_ima_file_hash_proto = { .func = bpf_ima_file_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], @@ -189,6 +197,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = { static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + if (prog->expected_attach_type == BPF_LSM_CGROUP) { + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + } + switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; @@ -207,20 +223,11 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: - return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + return &bpf_ima_inode_hash_proto; case BPF_FUNC_ima_file_hash: - return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + return &bpf_ima_file_hash_proto; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; - case BPF_FUNC_get_local_storage: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_get_local_storage_proto : NULL; - case BPF_FUNC_set_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? - &bpf_set_retval_proto : NULL; - case BPF_FUNC_get_retval: - return prog->expected_attach_type == BPF_LSM_CGROUP ? 
- &bpf_get_retval_proto : NULL; #ifdef CONFIG_NET case BPF_FUNC_setsockopt: if (prog->expected_attach_type != BPF_LSM_CGROUP) @@ -335,13 +342,30 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) +BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) +BTF_SET_START(untrusted_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_file_alloc_security) +BTF_ID(func, bpf_lsm_file_free_security) +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_ID(func, bpf_lsm_task_free) +BTF_SET_END(untrusted_lsm_hooks) + bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } +bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 84b2d9dba79a..ece9870cab68 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, refcount_set(&kvalue->refcnt, 1); bpf_map_inc(map); - set_memory_ro((long)st_map->image, 1); - set_memory_x((long)st_map->image, 1); + set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { /* Pair with smp_load_acquire() during lookup_elem(). diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e9014dc62682..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { migrate_disable(); - __this_cpu_inc(bpf_task_storage_busy); + this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { - __this_cpu_dec(bpf_task_storage_busy); + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); } static bool bpf_task_storage_trylock(void) { migrate_disable(); - if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - __this_cpu_dec(bpf_task_storage_busy); + if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); return false; } @@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct hlist_node *n; unsigned long flags; rcu_read_lock(); @@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. 
- */ - bpf_selem_unlink_map(selem); - free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_task_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); rcu_read_unlock(); - /* free_task_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_task_storage) kfree_rcu(local_storage, rcu); } @@ -184,7 +163,8 @@ out: return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map, + bool nobusy) { struct bpf_local_storage_data *sdata; @@ -192,6 +172,9 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; + if (!nobusy) + return -EBUSY; + bpf_selem_unlink(SELEM(sdata), true); return 0; @@ -220,63 +203,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) } bpf_task_storage_lock(); - err = task_storage_delete(task, map); + err = task_storage_delete(task, map, true); bpf_task_storage_unlock(); out: put_pid(pid); return err; } -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) +/* Called by bpf_task_storage_get*() helpers */ +static void *__bpf_task_storage_get(struct bpf_map *map, + struct task_struct *task, void *value, + u64 flags, gfp_t gfp_flags, bool nobusy) { struct bpf_local_storage_data *sdata; - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) - return (unsigned long)NULL; - - if (!task) - return (unsigned long)NULL; - - if (!bpf_task_storage_trylock()) - return (unsigned long)NULL; - - sdata = task_storage_lookup(task, map, true); + sdata = task_storage_lookup(task, map, nobusy); if (sdata) - goto unlock; + return sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); + return IS_ERR(sdata) ? NULL : sdata->data; + } + + return NULL; +} -unlock: +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) +{ + bool nobusy; + void *data; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + nobusy = bpf_task_storage_trylock(); + data = __bpf_task_storage_get(map, task, value, flags, + gfp_flags, nobusy); + if (nobusy) + bpf_task_storage_unlock(); + return (unsigned long)data; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) +{ + void *data; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + bpf_task_storage_lock(); + data = __bpf_task_storage_get(map, task, value, flags, + gfp_flags, true); bpf_task_storage_unlock(); - return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : - (unsigned long)sdata->data; + return (unsigned long)data; } -BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, +BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, task) { + bool nobusy; int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - if (!bpf_task_storage_trylock()) - return -EBUSY; + nobusy = bpf_task_storage_trylock(); + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + ret = task_storage_delete(task, map, nobusy); + if (nobusy) + bpf_task_storage_unlock(); + return ret; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!task) + return -EINVAL; + bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - ret = task_storage_delete(task, map); + ret = task_storage_delete(task, map, true); bpf_task_storage_unlock(); return ret; } @@ -288,26 +316,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &task_cache); } static void task_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -318,10 +335,21 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; +const struct bpf_func_proto bpf_task_storage_get_recur_proto = { + .func = bpf_task_storage_get_recur, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -333,6 +361,15 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { + .func = bpf_task_storage_delete_recur, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + 
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], +}; + const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7e64447659f3..f7dd8af06413 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> @@ -199,16 +200,18 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); enum btf_kfunc_hook { + BTF_KFUNC_HOOK_COMMON, BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; enum { - BTF_KFUNC_SET_MAX_CNT = 32, + BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, }; @@ -237,6 +240,7 @@ struct btf { struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; + struct btf_struct_metas *struct_meta_tab; /* split BTF support */ struct btf *base_btf; @@ -477,16 +481,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -static bool __btf_type_is_struct(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; -} - -static bool btf_type_is_array(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -818,6 +812,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return NULL; return btf->types[type_id]; } +EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Regular int is not a bit field and it must be either @@ -1396,7 +1391,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; - u8 kind = BTF_INFO_KIND(t->info); struct btf *btf = env->btf; va_list args; @@ -1412,7 +1406,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, - btf_kind_str[kind], + btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? 
" " : ""); @@ -1642,8 +1636,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf) btf->dtor_kfunc_tab = NULL; } +static void btf_struct_metas_free(struct btf_struct_metas *tab) +{ + int i; + + if (!tab) + return; + for (i = 0; i < tab->cnt; i++) { + btf_record_free(tab->types[i].record); + kfree(tab->types[i].field_offs); + } + kfree(tab); +} + +static void btf_free_struct_meta_tab(struct btf *btf) +{ + struct btf_struct_metas *tab = btf->struct_meta_tab; + + btf_struct_metas_free(tab); + btf->struct_meta_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); @@ -3128,7 +3144,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; - u16 last_member_type_id; + u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; @@ -3191,7 +3207,7 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_type { +enum btf_field_info_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, @@ -3203,18 +3219,28 @@ enum { }; struct btf_field_info { - u32 type_id; + enum btf_field_type type; u32 off; - enum bpf_kptr_type type; + union { + struct { + u32 type_id; + } kptr; + struct { + const char *node_name; + u32 value_btf_id; + } list_head; + }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3222,9 +3248,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { - enum bpf_kptr_type type; + enum btf_field_type type; u32 res_id; + /* Permit modifiers on the pointer itself */ + if (btf_type_is_volatile(t)) + t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; @@ -3248,28 +3277,135 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; + return BTF_FIELD_FOUND; +} + +static const char *btf_find_decl_tag_value(const struct btf *btf, + const struct btf_type *pt, + int comp_idx, const char *tag_key) +{ + int i; + + for (i = 1; i < btf_nr_types(btf); i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + int len = strlen(tag_key); + + if (!btf_type_is_decl_tag(t)) + continue; + if (pt != btf_type_by_id(btf, t->type) || + btf_type_decl_tag(t)->component_idx != comp_idx) + continue; + if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) + continue; + return __btf_name_by_offset(btf, t->name_off) + len; + } + return NULL; +} + +static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, + u32 off, int sz, struct btf_field_info *info) +{ + const char *value_type; + const char *list_node; + s32 id; + + if (!__btf_type_is_struct(t)) + return 
BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); + if (!value_type) + return -EINVAL; + list_node = strstr(value_type, ":"); + if (!list_node) + return -EINVAL; + value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + if (!value_type) + return -ENOMEM; + id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); + kfree(value_type); + if (id < 0) + return id; + list_node++; + if (str_is_empty(list_node)) + return -EINVAL; + info->type = BPF_LIST_HEAD; + info->off = off; + info->list_head.value_btf_id = id; + info->list_head.node_name = list_node; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + if (field_mask & BPF_LIST_HEAD) { + if (!strcmp(name, "bpf_list_head")) { + type = BPF_LIST_HEAD; + goto end; + } + } + if (field_mask & BPF_LIST_NODE) { + if (!strcmp(name, "bpf_list_node")) { + type = BPF_LIST_NODE; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3277,22 +3413,30 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_LIST_NODE: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, t, member_type, i, off, sz, + idx < info_cnt ? 
&info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3307,42 +3451,53 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_LIST_NODE: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, var, var_type, -1, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3357,169 +3512,327 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. 
- * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; - int ret; + const struct btf_type *t, *n = NULL; + const struct btf_member *member; + u32 offset; + int i; - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) + t = btf_type_by_id(btf, info->list_head.value_btf_id); + /* We've already checked that value_btf_id is a struct type. We + * just need to figure out the offset of the list_node, and + * verify its type. 
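For reference, the BTF shape the list_head parsing code accepts can be sketched as below. The __contains() macro is not part of this patch; it mirrors the BPF selftests' convention of emitting a btf_decl_tag of the form "contains:<type>:<field>" on the bpf_list_head member, and all type names here are illustrative (struct bpf_spin_lock/bpf_list_head/bpf_list_node come from vmlinux.h):

#define __contains(name, node) \
        __attribute__((btf_decl_tag("contains:" #name ":" #node)))

/* Element type: must embed a struct bpf_list_node. */
struct elem {
        int data;
        struct bpf_list_node node;
};

/* Owner: a bpf_list_head must be accompanied by a bpf_spin_lock in the same
 * value, which btf_parse_fields() enforces further down.
 */
struct map_value {
        struct bpf_spin_lock lock;
        struct bpf_list_head head __contains(elem, node);
};
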
+ */ + for_each_member(i, t, member) { + if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) + continue; + /* Invalid BTF, two members with same name */ + if (n) + return -EINVAL; + n = btf_type_by_id(btf, member->type); + if (!__btf_type_is_struct(n)) + return -EINVAL; + if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + return -EINVAL; + offset = __btf_member_bit_offset(n, member); + if (offset % 8) + return -EINVAL; + offset /= 8; + if (offset % __alignof__(struct bpf_list_node)) + return -EINVAL; + + field->list_head.btf = (struct btf *)btf; + field->list_head.value_btf_id = info->list_head.value_btf_id; + field->list_head.node_offset = offset; + } + if (!n) return -ENOENT; - return info.off; + return 0; } -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { - struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; - struct bpf_map_value_off *tab; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; - int ret, i, nr_off; + struct btf_field_info info_arr[BTF_FIELDS_MAX]; + struct btf_record *rec; + u32 next_off = 0; + int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; - nr_off = ret; - tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); - if (!tab) + cnt = ret; + /* This needs to be kzalloc to zero out padding and unused fields, see + * comment in btf_record_equal. + */ + rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + if (!rec) return ERR_PTR(-ENOMEM); - for (i = 0; i < nr_off; i++) { - const struct btf_type *t; - s32 id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; + goto end; + } + if (info_arr[i].off < next_off) { + ret = -EEXIST; + goto end; + } + next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. 
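For comparison with the list_head case, a referenced kptr field in a map value is declared through a BTF type tag. The __kptr_ref macro below mirrors the selftests' convention of expanding to btf_type_tag("kptr_ref"), struct prog_test_ref_kfunc is a selftests-only type whose destructor kfunc is registered via register_btf_id_dtor_kfuncs(), and the map layout assumes vmlinux.h plus libbpf's bpf_helpers.h:

#define __kptr_ref __attribute__((btf_type_tag("kptr_ref")))

struct kptr_value {
        struct prog_test_ref_kfunc __kptr_ref *ref;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct kptr_value);
} kptr_map SEC(".maps");

/* Programs move a reference in and out with bpf_kptr_xchg(); whatever is
 * still stored when the map is destroyed is dropped through the destructor
 * that btf_parse_kptr() stashes in field->kptr.dtor.
 */
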
- */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->field_mask |= info_arr[i].type; + rec->fields[i].offset = info_arr[i].off; + rec->fields[i].type = info_arr[i].type; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + case BPF_LIST_HEAD: + ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + case BPF_LIST_NODE: + break; + default: + ret = -EFAULT; goto end; } + rec->cnt++; + } - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } + /* bpf_list_head requires bpf_spin_lock */ + if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + ret = -EINVAL; + goto end; + } - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } + return rec; +end: + btf_record_free(rec); + return ERR_PTR(ret); +} - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) +{ + int i; - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - tab->off[i].kptr.dtor = (void *)addr; - } + /* There are two owning types, kptr_ref and bpf_list_head. The former + * only supports storing kernel types, which can never store references + * to program allocated local types, atleast not yet. Hence we only need + * to ensure that bpf_list_head ownership does not form cycles. + */ + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) + return 0; + for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *meta; + u32 btf_id; + + if (!(rec->fields[i].type & BPF_LIST_HEAD)) + continue; + btf_id = rec->fields[i].list_head.value_btf_id; + meta = btf_find_struct_meta(btf, btf_id); + if (!meta) + return -EFAULT; + rec->fields[i].list_head.value_rec = meta->record; - tab->off[i].offset = info_arr[i].off; - tab->off[i].type = info_arr[i].type; - tab->off[i].kptr.btf_id = id; - tab->off[i].kptr.btf = kernel_btf; - tab->off[i].kptr.module = mod; + if (!(rec->field_mask & BPF_LIST_NODE)) + continue; + + /* We need to ensure ownership acyclicity among all types. 
The + * proper way to do it would be to topologically sort all BTF + * IDs based on the ownership edges, since there can be multiple + * bpf_list_head in a type. Instead, we use the following + * reasoning: + * + * - A type can only be owned by another type in user BTF if it + * has a bpf_list_node. + * - A type can only _own_ another type in user BTF if it has a + * bpf_list_head. + * + * We ensure that if a type has both bpf_list_head and + * bpf_list_node, its element types cannot be owning types. + * + * To ensure acyclicity: + * + * When A only has bpf_list_head, ownership chain can be: + * A -> B -> C + * Where: + * - B has both bpf_list_head and bpf_list_node. + * - C only has bpf_list_node. + * + * When A has both bpf_list_head and bpf_list_node, some other + * type already owns it in the BTF domain, hence it can not own + * another owning type through any of the bpf_list_head edges. + * A -> B + * Where: + * - B only has bpf_list_node. + */ + if (meta->record->field_mask & BPF_LIST_HEAD) + return -ELOOP; } - tab->nr_off = nr_off; - return tab; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); -end: - while (i--) { - btf_put(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); + return 0; +} + +static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) +{ + struct btf_field_offs *foffs = (void *)priv; + u32 *off_base = foffs->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = foffs->field_sz + (a - off_base); + sz_b = foffs->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) +{ + struct btf_field_offs *foffs; + u32 i, *off; + u8 *sz; + + BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); + if (IS_ERR_OR_NULL(rec)) + return NULL; + + foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); + if (!foffs) + return ERR_PTR(-ENOMEM); + + off = foffs->field_off; + sz = foffs->field_sz; + for (i = 0; i < rec->cnt; i++) { + off[i] = rec->fields[i].offset; + sz[i] = btf_field_type_size(rec->fields[i].type); } - kfree(tab); - return ERR_PTR(ret); + foffs->cnt = rec->cnt; + + if (foffs->cnt == 1) + return foffs; + sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), + btf_field_offs_cmp, btf_field_offs_swap, foffs); + return foffs; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -4436,6 +4749,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_is_resolve_source_only(ret_type)) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + if (btf_type_needs_resolve(ret_type) && !env_type_is_resolved(env, ret_type_id)) { err = btf_resolve(env, ret_type, ret_type_id); @@ -4463,7 +4781,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env, nr_args--; } - err = 0; for (i = 0; i < nr_args; i++) { const struct btf_type *arg_type; u32 arg_type_id; @@ -4472,8 +4789,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env, arg_type = btf_type_by_id(btf, arg_type_id); if (!arg_type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; + } + + if (btf_type_is_resolve_source_only(arg_type)) { + 
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; } if (args[i].name_off && @@ -4481,25 +4802,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env, !btf_name_valid_identifier(btf, args[i].name_off))) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; } if (btf_type_needs_resolve(arg_type) && !env_type_is_resolved(env, arg_type_id)) { err = btf_resolve(env, arg_type, arg_type_id); if (err) - break; + return err; } if (!btf_type_id_size(btf, &arg_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; } } - return err; + return 0; } static int btf_func_check(struct btf_verifier_env *env, @@ -4854,7 +5173,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env) u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; - int err; btf = env->btf; btf_data_size = btf->data_size; @@ -4911,11 +5229,120 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -EINVAL; } - err = btf_check_sec_info(env, btf_data_size); - if (err) - return err; + return btf_check_sec_info(env, btf_data_size); +} - return 0; +static const char *alloc_obj_fields[] = { + "bpf_spin_lock", + "bpf_list_head", + "bpf_list_node", +}; + +static struct btf_struct_metas * +btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) +{ + union { + struct btf_id_set set; + struct { + u32 _cnt; + u32 _ids[ARRAY_SIZE(alloc_obj_fields)]; + } _arr; + } aof; + struct btf_struct_metas *tab = NULL; + int i, n, id, ret; + + BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); + BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); + + memset(&aof, 0, sizeof(aof)); + for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { + /* Try to find whether this special type exists in user BTF, and + * if so remember its ID so we can easily find it among members + * of structs that we iterate in the next loop. + */ + id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); + if (id < 0) + continue; + aof.set.ids[aof.set.cnt++] = id; + } + + if (!aof.set.cnt) + return NULL; + sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL); + + n = btf_nr_types(btf); + for (i = 1; i < n; i++) { + struct btf_struct_metas *new_tab; + const struct btf_member *member; + struct btf_field_offs *foffs; + struct btf_struct_meta *type; + struct btf_record *record; + const struct btf_type *t; + int j, tab_cnt; + + t = btf_type_by_id(btf, i); + if (!t) { + ret = -EINVAL; + goto free; + } + if (!__btf_type_is_struct(t)) + continue; + + cond_resched(); + + for_each_member(j, t, member) { + if (btf_id_set_contains(&aof.set, member->type)) + goto parse; + } + continue; + parse: + tab_cnt = tab ? tab->cnt : 0; + new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) { + ret = -ENOMEM; + goto free; + } + if (!tab) + new_tab->cnt = 0; + tab = new_tab; + + type = &tab->types[tab->cnt]; + type->btf_id = i; + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); + /* The record cannot be unset, treat it as an error if so */ + if (IS_ERR_OR_NULL(record)) { + ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; + goto free; + } + foffs = btf_parse_field_offs(record); + /* We need the field_offs to be valid for a valid record, + * either both should be set or both should be unset. 
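The ownership rule that btf_check_and_fixup_fields() enforces above is easiest to see with three sketched types (same illustrative __contains() convention as earlier):

struct c {                              /* owned only: just a node (C above) */
        struct bpf_list_node node;
};

struct b {                              /* owned and owner: node + head (B above) */
        struct bpf_list_node node;
        struct bpf_list_head head __contains(c, node);
        struct bpf_spin_lock lock;
};

struct a {                              /* owner only: head, no node (A above) */
        struct bpf_spin_lock lock;
        struct bpf_list_head head __contains(b, node);
};

/* The chain a -> b -> c is accepted. Pointing b's head at another owner that
 * can itself be owned (for example __contains(b, node)) is rejected with
 * -ELOOP: a type that has a bpf_list_node may not own a type that has a
 * bpf_list_head.
 */
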
+ */ + if (IS_ERR_OR_NULL(foffs)) { + btf_record_free(record); + ret = -EFAULT; + goto free; + } + type->record = record; + type->field_offs = foffs; + tab->cnt++; + } + return tab; +free: + btf_struct_metas_free(tab); + return ERR_PTR(ret); +} + +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + struct btf_struct_metas *tab; + + BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); + tab = btf->struct_meta_tab; + if (!tab) + return NULL; + return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); } static int btf_check_type_tags(struct btf_verifier_env *env, @@ -4968,6 +5395,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { + struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; @@ -5036,15 +5464,34 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + struct_meta_tab = btf_parse_struct_metas(log, btf); + if (IS_ERR(struct_meta_tab)) { + err = PTR_ERR(struct_meta_tab); + goto errout; + } + btf->struct_meta_tab = struct_meta_tab; + + if (struct_meta_tab) { + int i; + + for (i = 0; i < struct_meta_tab->cnt; i++) { + err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); + if (err < 0) + goto errout_meta; + } + } + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; - goto errout; + goto errout_meta; } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; +errout_meta: + btf_free_struct_meta_tab(btf); errout: btf_verifier_env_free(env); if (btf) @@ -5086,7 +5533,7 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -static const struct btf_member * +const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) @@ -5159,6 +5606,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) +{ + const struct btf_member *kctx_member; + const struct btf_type *conv_struct; + const struct btf_type *kctx_type; + u32 kctx_type_id; + + conv_struct = bpf_ctx_convert.t; + /* get member for kernel ctx type */ + kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + kctx_type_id = kctx_member->type; + kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); + if (!btf_type_is_struct(kctx_type)) { + bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); + return -EINVAL; + } + + return kctx_type_id; +} + BTF_ID_LIST(bpf_ctx_convert_btf_id) BTF_ID(struct, bpf_ctx_convert) @@ -5328,6 +5795,50 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } +static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 offset = 0, nr_args; + int i; + + if (!func_proto) + return off / 8; + + nr_args = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nr_args; i++) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + offset += btf_type_is_ptr(t) ? 
8 : roundup(t->size, 8); + if (off < offset) + return i; + } + + t = btf_type_skip_modifiers(btf, func_proto->type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return nr_args; + + return nr_args + 1; +} + +static bool prog_args_trusted(const struct bpf_prog *prog) +{ + enum bpf_attach_type atype = prog->expected_attach_type; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; + case BPF_PROG_TYPE_LSM: + return bpf_lsm_is_trusted(prog); + case BPF_PROG_TYPE_STRUCT_OPS: + return true; + default: + return false; + } +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -5347,7 +5858,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = off / 8; + arg = get_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. @@ -5398,7 +5909,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } break; @@ -5417,7 +5928,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5425,7 +5936,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, "func '%s' arg%d '%s' has type %s. 
Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } @@ -5471,6 +5982,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = PTR_TO_BTF_ID; + if (prog_args_trusted(prog)) + info->reg_type |= PTR_TRUSTED; + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -5509,11 +6023,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", - tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", - tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], + tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); return true; } @@ -5737,6 +6251,9 @@ error: /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; + /* check __rcu tag */ + if (strcmp(tag_value, "rcu") == 0) + tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); @@ -5766,20 +6283,50 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype __maybe_unused, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; + const struct btf_type *t; + u32 id = reg->btf_id; int err; - u32 id; + while (type_is_alloc(reg->type)) { + struct btf_struct_meta *meta; + struct btf_record *rec; + int i; + + meta = btf_find_struct_meta(btf, id); + if (!meta) + break; + rec = meta->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 offset = field->offset; + if (off < offset + btf_field_type_size(field->type) && offset < off + size) { + bpf_log(log, + "direct access to %s is disallowed\n", + btf_field_type_name(field->type)); + return -EACCES; + } + } + break; + } + + t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: + /* For local types, the destination register cannot + * become a pointer again. + */ + if (type_is_alloc(reg->type)) + return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ @@ -5814,8 +6361,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. */ -static bool btf_types_are_same(const struct btf *btf1, u32 id1, - const struct btf *btf2, u32 id2) +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) { if (id1 != id2) return false; @@ -5864,26 +6411,25 @@ again: } static int __get_type_size(struct btf *btf, u32 btf_id, - const struct btf_type **bad_type) + const struct btf_type **ret_type) { const struct btf_type *t; + *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!t) { - *bad_type = btf_type_by_id(btf, 0); + if (!t) return -EINVAL; - } + *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. 
Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) return t->size; - *bad_type = t; return -EINVAL; } @@ -5902,8 +6448,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; + m->arg_flags[i] = 0; + } m->ret_size = 8; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; @@ -5917,10 +6465,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0) { + if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", - tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; @@ -5933,10 +6481,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); - if (ret < 0) { + + /* No support of struct argument size greater than 16 bytes */ + if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", - tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, i, btf_type_str(t)); return -EINVAL; } if (ret == 0) { @@ -5946,6 +6496,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } m->arg_size[i] = ret; + m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0; } m->nr_args = nargs; return 0; @@ -6093,95 +6644,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { -#ifdef CONFIG_NET - [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], - [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], -#endif -}; - -/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ -static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int rec) -{ - const struct btf_type *member_type; - const struct btf_member *member; - u32 i; - - if (!btf_type_is_struct(t)) - return false; - - for_each_member(i, t, member) { - const struct btf_array *array; - - member_type = btf_type_skip_modifiers(btf, member->type, NULL); - if (btf_type_is_struct(member_type)) { - if (rec >= 3) { - bpf_log(log, "max struct nesting depth exceeded\n"); - return false; - } - if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) - return false; - continue; - } - if (btf_type_is_array(member_type)) { - array = btf_type_array(member_type); - if (!array->nelems) - return false; - member_type = btf_type_skip_modifiers(btf, array->type, NULL); - if (!btf_type_is_scalar(member_type)) - return false; - continue; - } - if (!btf_type_is_scalar(member_type)) - return false; - } - return true; -} - -static bool is_kfunc_arg_mem_size(const struct btf *btf, - const struct btf_param *arg, - const struct bpf_reg_state *reg) -{ - int len, sfx_len = sizeof("__sz") - 1; - const struct btf_type *t; - const char *param_name; - - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) - return false; - - /* In the future, this can be ported to use BTF tagging */ 
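The helper removed here is what recognizes the "__sz" suffix for (pointer, size) kfunc argument pairs; the convention itself survives in the kfunc-specific checks that replace this path. A hedged module-style sketch of defining such a kfunc and registering it for all program types through the new common hook (every name below is hypothetical):

#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical kfunc: zeroes dst; dst__sz carries the size of dst, which is
 * what the (pointer, size)-pair matching keys on.
 */
noinline int bpf_example_fill(void *dst, u32 dst__sz)
{
        memset(dst, 0, dst__sz);
        return 0;
}

BTF_SET8_START(example_kfunc_ids)
BTF_ID_FLAGS(func, bpf_example_fill)
BTF_SET8_END(example_kfunc_ids)

static const struct btf_kfunc_id_set example_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &example_kfunc_ids,
};

static int __init example_kfunc_init(void)
{
        /* BPF_PROG_TYPE_UNSPEC maps to BTF_KFUNC_HOOK_COMMON (see the
         * bpf_prog_type_to_kfunc_hook() hunk later in this diff), so the set
         * is consulted for every program type. register_btf_fmodret_id_set()
         * is the analogous entry point for fmod_ret-able functions.
         */
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &example_kfunc_set);
}
module_init(example_kfunc_init);
MODULE_LICENSE("GPL");
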
- param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len < sfx_len) - return false; - param_name += len - sfx_len; - if (strncmp(param_name, "__sz", sfx_len)) - return false; - - return true; -} - static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags) + bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - bool rel = false, kptr_get = false, trusted_arg = false; struct bpf_verifier_log *log = &env->log; - u32 i, nargs, ref_id, ref_obj_id = 0; - bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - int ref_regno = 0, ret; + u32 i, nargs, ref_id; + int ret; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -6207,13 +6682,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc) { - /* Only kfunc can be release func */ - rel = kfunc_flags & KF_RELEASE; - kptr_get = kfunc_flags & KF_KPTR_GET; - trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; - } - /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -6236,70 +6704,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* Check if argument must be a referenced pointer, args + i has - * been verified to be a pointer (after skipping modifiers). - */ - if (is_kfunc && trusted_arg && !reg->ref_obj_id) { - bpf_log(log, "R%d must be referenced\n", regno); - return -EINVAL; - } - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - /* Trusted args have the same offset checks as release arguments */ - if (trusted_arg || (rel && reg->ref_obj_id)) - arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; - /* kptr_get is only true for kfunc */ - if (i == 0 && kptr_get) { - struct bpf_map_value_off_desc *off_desc; - - if (reg->type != PTR_TO_MAP_VALUE) { - bpf_log(log, "arg#0 expected pointer to map value\n"); - return -EINVAL; - } - - /* check_func_arg_reg_off allows var_off for - * PTR_TO_MAP_VALUE, but we need fixed offset to find - * off_desc. 
- */ - if (!tnum_is_const(reg->var_off)) { - bpf_log(log, "arg#0 must have constant offset\n"); - return -EINVAL; - } - - off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); - if (!off_desc || off_desc->type != BPF_KPTR_REF) { - bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", - reg->off + reg->var_off.value); - return -EINVAL; - } - - if (!btf_type_is_ptr(ref_t)) { - bpf_log(log, "arg#0 BTF type must be a double pointer\n"); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id, true)) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - /* rest of the arguments can be anything, like normal kfunc */ - } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -6309,86 +6721,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || - (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { - const struct btf_type *reg_ref_t; - const struct btf *reg_btf; - const char *reg_ref_tname; - u32 reg_ref_id; - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (reg->type == PTR_TO_BTF_ID) { - reg_btf = reg->btf; - reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ - if (reg->ref_obj_id) { - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } - } else { - reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[base_type(reg->type)]; - } - - reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, - ®_ref_id); - reg_ref_tname = btf_name_by_offset(reg_btf, - reg_ref_t->name_off); - if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id, - trusted_arg || (rel && reg->ref_obj_id))) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - func_name, i, - btf_type_str(ref_t), ref_tname, - regno, btf_type_str(reg_ref_t), - reg_ref_tname); - return -EINVAL; - } - } else if (ptr_to_mem_ok) { + } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; - if (is_kfunc) { - bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); - - /* Permit pointer to mem, but only when argument - * type is pointer to scalar, or struct composed - * (recursively) of scalars. - * When arg_mem_size is true, the pointer can be - * void *. - */ - if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && - (arg_mem_size ? 
!btf_type_is_void(ref_t) : 1)) { - bpf_log(log, - "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); - return -EINVAL; - } - - /* Check for mem, len pair */ - if (arg_mem_size) { - if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { - bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", - i, i + 1); - return -EINVAL; - } - i++; - continue; - } - } - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -6401,29 +6737,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { - bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, - is_kfunc ? "kernel " : "", func_name, func_id); + bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, + func_name, func_id); return -EINVAL; } } - /* Either both are set, or neither */ - WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); - /* We already made sure ref_obj_id is set only for one argument. We do - * allow (!rel && ref_obj_id), so that passing such referenced - * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when - * is_kfunc is true. - */ - if (rel && !ref_obj_id) { - bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - /* returns argument register number > 0 in case of reference release kfunc */ - return rel ? ref_regno : 0; + return 0; } -/* Compare BTF of a function with given bpf_reg_state. +/* Compare BTF of a function declaration with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. @@ -6450,7 +6773,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6461,12 +6784,47 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return err; } -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - u32 kfunc_flags) +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + * + * NOTE: the code is duplicated from btf_check_subprog_arg_match() + * because btf_check_func_arg_match() is still doing both. Once that + * function is split in 2, we can call from here btf_check_subprog_arg_match() + * first, and then treat the calling part in a new code path. 
+ */ +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags); + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; } /* Convert BTF of a function into bpf_reg_state if possible @@ -6580,7 +6938,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + i, btf_type_str(t), tname); return -EINVAL; } return 0; @@ -6851,23 +7209,6 @@ bool btf_is_module(const struct btf *btf) return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } -static int btf_id_cmp_func(const void *a, const void *b) -{ - const int *pa = a, *pb = b; - - return *pa - *pb; -} - -bool btf_id_set_contains(const struct btf_id_set *set, u32 id) -{ - return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; -} - -static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) -{ - return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); -} - enum { BTF_MODULE_F_LIVE = (1 << 0), }; @@ -7228,6 +7569,8 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { + case BPF_PROG_TYPE_UNSPEC: + return BTF_KFUNC_HOOK_COMMON; case BPF_PROG_TYPE_XDP: return BTF_KFUNC_HOOK_XDP; case BPF_PROG_TYPE_SCHED_CLS: @@ -7235,6 +7578,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; @@ -7255,16 +7599,24 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id) { enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags) + return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7283,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if (IS_ERR(btf)) return PTR_ERR(btf); - hook = 
bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..bf2fdb33fb31 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type from_atype, to_atype; @@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int total_cnt = 0; u32 flags; + if (effective_query && prog_attach_flags) + return -EINVAL; + if (type == BPF_LSM_CGROUP) { - if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + if (!effective_query && attr->query.prog_cnt && + prog_ids && !prog_attach_flags) return -EINVAL; from_atype = CGROUP_LSM_START; @@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); @@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } } + /* always output uattr->query.attach_flags as 0 during effective query */ + flags = effective_query ? 
0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) @@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); @@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, if (++i == cnt) break; } - } - if (prog_attach_flags) { - flags = cgrp->bpf.flags[atype]; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; - for (i = 0; i < cnt; i++) - if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) - return -EFAULT; - prog_attach_flags += cnt; + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, + &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } } prog_ids += cnt; @@ -1529,6 +1537,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. + */ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; + void *ptr; + + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = @@ -1560,32 +1599,26 @@ const struct bpf_func_proto bpf_set_retval_proto = { }; static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_retval: - return &bpf_get_retval_proto; - case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } } -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -2098,11 
+2131,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_strtol: - return &bpf_strtol_proto; - case BPF_FUNC_strtoul: - return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: @@ -2113,8 +2152,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2235,6 +2276,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2256,8 +2307,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2422,3 +2475,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { const struct bpf_prog_ops cg_sockopt_prog_ops = { }; + +/* Common helpers for cgroup hooks. */ +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + case BPF_FUNC_get_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_get_retval_proto; + } + case BPF_FUNC_set_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_set_retval_proto; + } + default: + return NULL; + } +} + +/* Common helpers for cgroup hooks with valid process context. 
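With the hunks above, sysctl and sockopt programs start pulling helpers from the shared cgroup_common_func_proto()/cgroup_current_func_proto() tables. A small illustrative cgroup/sysctl program using one of the newly reachable helpers (the policy and names are made up; assumes vmlinux.h plus libbpf's bpf_helpers.h):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("cgroup/sysctl")
int deny_writes_by_comm(struct bpf_sysctl *ctx)
{
        char comm[16];

        /* bpf_get_current_comm() reaches sysctl programs through the new
         * cgroup_current_func_proto() above; it was not offered by the old
         * cgroup_base_func_proto() fallback.
         */
        bpf_get_current_comm(comm, sizeof(comm));

        /* Purely illustrative policy: refuse writes from tools whose comm
         * starts with 'p'; returning 0 surfaces as -EPERM to the writer.
         */
        if (ctx->write && comm[0] == 'p')
                return 0;
        return 1;
}
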
*/ +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + default: + return NULL; + } +} diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..06989d278846 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. */ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. 
+ */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_CGROUP_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. */ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + /* bpf_iter_attach_cgroup() has already acquired an extra reference + * for the start cgroup, but the reference may be released after + * cgroup_iter_seq_init(), so acquire another reference for the + * start cgroup. 
+ */ + p->start_css = &cgrp->self; + css_get(p->start_css); + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static void cgroup_iter_seq_fini(void *priv) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + + css_put(p->start_css); +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .fini_seq_private = cgroup_iter_seq_fini, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_v1v2_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. */ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t<unknown>\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. 
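On the userspace side, the attach parameters validated in bpf_iter_attach_cgroup() above (exactly one of cgroup_fd or cgroup_id, plus a traversal order) travel through union bpf_iter_link_info. A hedged libbpf sketch, with error handling trimmed and names chosen for illustration:

#include <fcntl.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int attach_cgroup_iter(struct bpf_program *prog, const char *cg_path)
{
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	union bpf_iter_link_info linfo = {};
	struct bpf_link *link;
	int cg_fd;

	cg_fd = open(cg_path, O_RDONLY);	/* start of the walk */
	linfo.cgroup.cgroup_fd = cg_fd;		/* or set cgroup_id, not both */
	linfo.cgroup.order = BPF_CGROUP_ITER_DESCENDANTS_PRE;
	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	link = bpf_program__attach_iter(prog, &opts);
	if (!link)
		return -1;

	/* Each read() session on the returned fd runs one full walk. */
	return bpf_iter_create(bpf_link__fd(link));
}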
+ */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_CGROUP_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3d9eb3ae334c..ba3fff17e2f9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -34,6 +34,7 @@ #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> +#include <linux/bpf_mem_alloc.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -60,6 +61,9 @@ #define CTX regs[BPF_REG_CTX] #define IMM insn->imm +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; + /* No hurry in this branch * * Exported for the bpf jit load helper. 
@@ -825,6 +829,11 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) +{ + memset(area, 0, size); +} + #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); @@ -859,12 +868,11 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } -static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -878,8 +886,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); - set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } @@ -905,7 +912,7 @@ out: return ptr; } -static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(struct bpf_binary_header *hdr) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; @@ -1027,7 +1034,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -1089,7 +1096,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; @@ -1211,7 +1218,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, bool emit_zext) { struct bpf_insn *to = to_buff; - u32 imm_rnd = get_random_int(); + u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); @@ -2002,7 +2009,7 @@ out: static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_EXT_REG]; \ + u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -2083,6 +2090,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { + enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; if (fp->kprobe_override) @@ -2093,12 +2101,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map, /* There's no owner yet where we could check for * compatibility. 
*/ - map->owner.type = fp->type; + map->owner.type = prog_type; map->owner.jited = fp->jited; map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = map->owner.type == fp->type && + ret = map->owner.type == prog_type && map->owner.jited == fp->jited && map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } @@ -2246,8 +2254,14 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; + /* If RCU Tasks Trace grace period implies RCU grace period, there is + * no need to call kfree_rcu(), just call kfree() directly. + */ progs = container_of(rcu, struct bpf_prog_array, rcu); - kfree_rcu(progs, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(progs); + else + kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) @@ -2623,6 +2637,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; @@ -2734,6 +2749,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +#ifdef CONFIG_BPF_SYSCALL +static int __init bpf_global_ma_init(void) +{ + int ret; + + ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); + bpf_global_ma_set = !ret; + return ret; +} +late_initcall(bpf_global_ma_init); +#endif + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f4860ac756cd..e0b2d016f0bf 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -4,13 +4,16 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ -/* The 'cpumap' is primarily used as a backend map for XDP BPF helper +/** + * DOC: cpu map + * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * - * Unlike devmap which redirects XDP frames out another NIC device, + * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. - * + */ +/* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. 
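As a sketch of how a cpumap is consumed on the XDP side (illustrative program, not taken from this patch): packets are steered to an entry with bpf_redirect_map(), and the remote CPU selected there performs the skb allocation described above.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, struct bpf_cpumap_val);
} cpu_map SEC(".maps");

SEC("xdp")
int xdp_redirect_cpu(struct xdp_md *ctx)
{
	/* Pick a destination CPU; a real program would hash the flow. */
	__u32 cpu = ctx->rx_queue_index % 64;

	/* The low bits of the flags argument select the fallback action
	 * (XDP_PASS here) if the chosen slot is not populated.
	 */
	return bpf_redirect_map(&cpu_map, cpu, XDP_PASS);
}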
This @@ -85,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; - int err = -ENOMEM; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -97,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (attr->max_entries > NR_CPUS) + return ERR_PTR(-E2BIG); + + cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&cmap->map, attr); - /* Pre-limit array size based on NR_CPUS, not final CPU check */ - if (cmap->map.max_entries > NR_CPUS) { - err = -E2BIG; - goto free_cmap; - } - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); - if (!cmap->cpu_map) - goto free_cmap; + if (!cmap->cpu_map) { + bpf_map_area_free(cmap); + return ERR_PTR(-ENOMEM); + } return &cmap->map; -free_cmap: - kfree(cmap); - return ERR_PTR(err); } static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -623,7 +622,7 @@ static void cpu_map_free(struct bpf_map *map) __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } bpf_map_area_free(cmap->cpu_map); - kfree(cmap); + bpf_map_area_free(cmap); } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or @@ -668,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0e02b009487..d01e4c55b376 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); + dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { - kfree(dtab); + bpf_map_area_free(dtab); return ERR_PTR(err); } @@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - kfree(dtab); + bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 2444bd15cc2d..fa3e9225aedc 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -4,6 +4,7 @@ #include 
<linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/static_call.h> /* The BPF dispatcher is a multiway branch code generator. The * dispatcher is a mechanism to avoid the performance penalty of an @@ -85,12 +86,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, return false; } -int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } -static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; @@ -99,34 +100,38 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } - return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); + return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new; - u32 noff; - int err; - - if (!prev_num_progs) { - old = NULL; - noff = 0; - } else { - old = d->image + d->image_off; + void *new, *tmp; + u32 noff = 0; + + if (prev_num_progs) noff = d->image_off ^ (PAGE_SIZE / 2); - } new = d->num_progs ? d->image + noff : NULL; + tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { - if (bpf_dispatcher_prepare(d, new)) + /* Prepare the dispatcher in d->rw_image. Then use + * bpf_arch_text_copy to update d->image, which is RO+X. + */ + if (bpf_dispatcher_prepare(d, new, tmp)) + return; + if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2))) return; } - err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); - if (err || !new) - return; + __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func); - d->image_off = noff; + /* Make sure all the callers executing the previous/old half of the + * image leave it, so following update call can modify it safely. + */ + synchronize_rcu(); + + if (new) + d->image_off = noff; } void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, @@ -140,9 +145,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); if (!d->image) goto out; + d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!d->rw_image) { + u32 size = PAGE_SIZE; + + bpf_arch_text_copy(d->image, &size, sizeof(size)); + bpf_prog_pack_free((struct bpf_binary_header *)d->image); + d->image = NULL; + goto out; + } bpf_image_ksym_add(d->image, &d->ksym); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6c530a5e560a..5aa2b5525f79 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,6 +14,7 @@ #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" +#include <linux/bpf_mem_alloc.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -67,24 +68,16 @@ * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be - * atomic contexts even on RT. 
These mechanisms require preallocated maps, - * so there is no need to invoke memory allocations within the lock held - * sections. - * - * BPF maps which need dynamic allocation are only used from (forced) - * thread context on RT and can therefore use regular spinlocks which in - * turn allows to invoke memory allocations from the lock held section. - * - * On a non RT kernel this distinction is neither possible nor required. - * spinlock maps to raw_spinlock and the extra code is optimized out by the - * compiler. + * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, + * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, + * because there is no memory allocation within the lock held sections. However + * after hash map was fully converted to use bpf_mem_alloc, there will be + * non-synchronous memory allocation for non-preallocated hash map, so it is + * safe to always use raw spinlock for bucket lock. */ struct bucket { struct hlist_nulls_head head; - union { - raw_spinlock_t raw_lock; - spinlock_t lock; - }; + raw_spinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -92,6 +85,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; + struct bpf_mem_alloc ma; + struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { @@ -99,7 +94,12 @@ struct bpf_htab { struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; - atomic_t count; /* number of elements in this hashtable */ + /* number of elements in non-preallocated hashtable are kept + * in either pcount or count + */ + struct percpu_counter pcount; + atomic_t count; + bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; @@ -114,14 +114,14 @@ struct htab_elem { struct { void *padding; union { - struct bpf_htab *htab; struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { - struct rcu_head rcu; + /* pointer to per-cpu pointer */ + void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; @@ -133,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab) return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } -static inline bool htab_use_raw_lock(const struct bpf_htab *htab) -{ - return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); -} - static void htab_init_buckets(struct bpf_htab *htab) { unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) { - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, + raw_spin_lock_init(&htab->buckets[i].raw_lock); + lockdep_set_class(&htab->buckets[i].raw_lock, &htab->lockdep_key); - } else { - spin_lock_init(&htab->buckets[i].lock); - lockdep_set_class(&htab->buckets[i].lock, - &htab->lockdep_key); - } cond_resched(); } } @@ -165,17 +154,14 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, hash = hash & HASHTAB_MAP_LOCK_MASK; - migrate_disable(); + preempt_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + preempt_enable(); return -EBUSY; } - if (htab_use_raw_lock(htab)) - raw_spin_lock_irqsave(&b->raw_lock, flags); - else - spin_lock_irqsave(&b->lock, flags); + raw_spin_lock_irqsave(&b->raw_lock, flags); *pflags = flags; return 0; @@ -186,12 +172,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, unsigned long flags) { hash = hash 
& HASHTAB_MAP_LOCK_MASK; - if (htab_use_raw_lock(htab)) - raw_spin_unlock_irqrestore(&b->raw_lock, flags); - else - spin_unlock_irqrestore(&b->lock, flags); + raw_spin_unlock_irqrestore(&b->raw_lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + preempt_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -239,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -248,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } -static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_kptrs(&htab->map)) + if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); - for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -428,8 +408,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); @@ -491,7 +469,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) struct bpf_htab *htab; int err, i; - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -546,10 +524,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else - htab->hashrnd = get_random_int(); + htab->hashrnd = get_random_u32(); htab_init_buckets(htab); +/* compute_batch_value() computes batch value as num_online_cpus() * 2 + * and __percpu_counter_compare() needs + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() + * for percpu_counter to be faster than atomic_t. In practice the average bpf + * hash map size is 10k, which means that a system with 64 cpus will fill + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore + * define our own batch count as 32 then 10k hash map can be filled up to 80%: + * 10k - 8k > 32 _batch_ * 64 _cpus_ + * and __percpu_counter_compare() will still be fast. At that point hash map + * collisions will dominate its performance anyway. Assume that hash map filled + * to 50+% isn't going to be O(1) and use the following formula to choose + * between percpu_counter and atomic_t. 
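A worked instance of the rule spelled out above, with illustrative numbers rather than anything from the patch: with 64 online CPUs and the batch fixed at 32, the percpu_counter path is only chosen for maps larger than 2 * 64 * 32 = 4096 entries, since smaller maps would lose the required headroom long before they fill up.

/* Illustrative restatement of the selection rule; not patch code. */
#define PERCPU_COUNTER_BATCH	32

static bool prefer_percpu_counter(unsigned int max_entries,
				  unsigned int online_cpus)
{
	/* Keep at least half the map as headroom above batch * cpus,
	 * e.g. with 64 CPUs: percpu_counter only for max_entries > 4096.
	 */
	return max_entries / 2 > online_cpus * PERCPU_COUNTER_BATCH;
}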
+ */ +#define PERCPU_COUNTER_BATCH 32 + if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) + htab->use_percpu_counter = true; + + if (htab->use_percpu_counter) { + err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); + if (err) + goto free_map_locked; + } + if (prealloc) { err = prealloc_init(htab); if (err) @@ -563,6 +564,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_prealloc; } + } else { + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); + if (err) + goto free_map_locked; + if (percpu) { + err = bpf_mem_alloc_init(&htab->pcpu_ma, + round_up(htab->map.value_size, 8), true); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -570,12 +581,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); free_map_locked: + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -746,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); - if (map_value_has_kptrs(&htab->map)) - bpf_map_free_kptrs(&htab->map, map_value); + bpf_obj_free_fields(htab->map.record, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -847,17 +859,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) - free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); check_and_free_fields(htab, l); - kfree(l); -} - -static void htab_elem_free_rcu(struct rcu_head *head) -{ - struct htab_elem *l = container_of(head, struct htab_elem, rcu); - struct bpf_htab *htab = l->htab; - - htab_elem_free(htab, l); + bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -871,6 +875,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) } } +static bool is_map_full(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, + PERCPU_COUNTER_BATCH) >= 0; + return atomic_read(&htab->count) >= htab->map.max_entries; +} + +static void inc_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); + else + atomic_inc(&htab->count); +} + +static void dec_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); + else + atomic_dec(&htab->count); +} + + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); @@ -879,9 +908,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - atomic_dec(&htab->count); - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); + dec_elem_count(htab); + htab_elem_free(htab, l); } } @@ -906,13 +934,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void 
__percpu *pptr, static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { - /* When using prealloc and not setting the initial value on all cpus, - * zero-fill element values for other cpus (just as what happens when - * not using prealloc). Otherwise, bpf program has no way to ensure + /* When not setting the initial value on all cpus, zero-fill element + * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ - if (htab_is_prealloc(htab) && !onallcpus) { + if (!onallcpus) { u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; @@ -963,19 +990,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = container_of(l, struct htab_elem, fnode); } } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) - if (!old_elem) { + if (is_map_full(htab)) + if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ - l_new = ERR_PTR(-E2BIG); - goto dec_count; - } - l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_NOWAIT | __GFP_NOWARN, - htab->map.numa_node); + return ERR_PTR(-E2BIG); + inc_elem_count(htab); + l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -986,18 +1010,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_NOWAIT | __GFP_NOWARN); + pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { - kfree(l_new); + bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } + l_new->ptr_to_pptr = pptr; + pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -1016,7 +1040,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; dec_count: - atomic_dec(&htab->count); + dec_elem_count(htab); return l_new; } @@ -1061,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1416,6 +1440,10 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; + /* It's called from a worker thread, so disable migration here, + * since bpf_mem_cache_free() relies on that. + */ + migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; @@ -1426,6 +1454,7 @@ static void delete_all_elements(struct bpf_htab *htab) htab_elem_free(htab, l); } } + migrate_enable(); } static void htab_free_malloced_timers(struct bpf_htab *htab) @@ -1439,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. 
- */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1455,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1475,24 +1500,27 @@ static void htab_map_free(struct bpf_map *map) * There is no need to synchronize_rcu() here to protect map elements. */ - /* some of free_htab_elem() callbacks for elements of this map may - * not have executed. Wait for them. + /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it + * underneath and is reponsible for waiting for callbacks to finish + * during bpf_mem_alloc_destroy(). */ - rcu_barrier(); if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { - htab_free_prealloced_kptrs(htab); + htab_free_prealloced_fields(htab); prealloc_destroy(htab); } - bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); } static void htab_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1636,7 +1664,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; @@ -1691,8 +1719,11 @@ again_nocopy: /* do not grab the lock unless need it (bucket_cnt > 0). 
*/ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); - if (ret) - goto next_batch; + if (ret) { + rcu_read_unlock(); + bpf_enable_instrumentation(); + goto after_loop; + } } bucket_cnt = 0; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..af30c6cbd65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> +#include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> @@ -15,9 +16,11 @@ #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> +#include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/security.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> #include "../../lib/kstrtox.h" @@ -198,6 +201,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_tai_ns) +{ + /* NMI safe access to clock tai */ + return ktime_get_tai_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { + .func = bpf_ktime_get_tai_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -323,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) @@ -345,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, @@ -353,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -415,40 +432,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; - -#ifdef CONFIG_CGROUP_BPF - -BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) -{ - /* flags argument is not used now, - * but provides an ability to extend the API. - * verifier checks that its value is correct. 
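The bpf_ktime_get_tai_ns() helper introduced earlier in this file simply wraps the NMI-safe CLOCK_TAI accessor; a minimal illustrative caller (the attach point is an assumption for the example):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int log_tai(void *ctx)
{
	/* CLOCK_TAI timestamp in nanoseconds. */
	__u64 now = bpf_ktime_get_tai_ns();

	bpf_printk("tai ns: %llu", now);
	return 0;
}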
- */ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; - struct bpf_cg_run_ctx *ctx; - void *ptr; - - /* get current cgroup storage from BPF run context */ - ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); - storage = ctx->prog_item->cgroup_storage[stype]; - - if (stype == BPF_CGROUP_STORAGE_SHARED) - ptr = &READ_ONCE(storage->buf)->data[0]; - else - ptr = this_cpu_ptr(storage->percpu_buf); - - return (unsigned long)ptr; -} - -const struct bpf_func_proto bpf_get_local_storage_proto = { - .func = bpf_get_local_storage, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; -#endif +#endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F @@ -577,7 +561,6 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; -#endif BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { @@ -678,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -708,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -1190,7 +1175,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); @@ -1398,10 +1383,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() - * helper is determined dynamically by the verifier. + * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to + * denote type that verifier will determine. 
*/ -#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) - static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, @@ -1420,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1430,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1454,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1468,6 +1452,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_ { int err; + BTF_TYPE_EMIT(struct bpf_dynptr); + err = bpf_dynptr_check_size(size); if (err) goto error; @@ -1497,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1509,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1520,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1537,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. 
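The switch from memcpy() to memmove() matters because the source and destination ranges really can alias. A small userspace sketch, purely illustrative, of why overlapping copies need memmove():

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16] = "abcdefgh";

	/* dst = buf + 2 overlaps src = buf: memcpy() would be undefined
	 * behaviour here, while memmove() copies as if via a temporary.
	 */
	memmove(buf + 2, buf, 8);
	printf("%.10s\n", buf);		/* prints "ababcdefgh" */
	return 0;
}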
+ */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } @@ -1546,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1574,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -1617,6 +1611,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_tai_ns: + return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -1627,26 +1623,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; - case BPF_FUNC_ringbuf_reserve_dynptr: - return &bpf_ringbuf_reserve_dynptr_proto; - case BPF_FUNC_ringbuf_submit_dynptr: - return &bpf_ringbuf_submit_dynptr_proto; - case BPF_FUNC_ringbuf_discard_dynptr: - return &bpf_ringbuf_discard_dynptr_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_loop: - return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; - case BPF_FUNC_dynptr_from_mem: - return &bpf_dynptr_from_mem_proto; - case BPF_FUNC_dynptr_read: - return &bpf_dynptr_read_proto; - case BPF_FUNC_dynptr_write: - return &bpf_dynptr_write_proto; - case BPF_FUNC_dynptr_data: - return &bpf_dynptr_data_proto; + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; default: break; } @@ -1675,6 +1657,32 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif default: break; } @@ -1711,3 +1719,402 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; } } + +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock) +{ + struct list_head *head = 
list_head, *orig_head = list_head; + + BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); + BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + + /* Do the actual list draining outside the lock to not hold the lock for + * too long, and also prevent deadlocks if tracing programs end up + * executing on entry/exit of functions called inside the critical + * section, and end up doing map ops that call bpf_list_head_free for + * the same map value again. + */ + __bpf_spin_lock_irqsave(spin_lock); + if (!head->next || list_empty(head)) + goto unlock; + head = head->next; +unlock: + INIT_LIST_HEAD(orig_head); + __bpf_spin_unlock_irqrestore(spin_lock); + + while (head != orig_head) { + void *obj = head; + + obj -= field->list_head.node_offset; + head = head->next; + /* The contained type can also have resources, including a + * bpf_list_head which needs to be freed. + */ + bpf_obj_free_fields(field->list_head.value_rec, obj); + /* bpf_mem_free requires migrate_disable(), since we can be + * called from map free path as well apart from BPF program (as + * part of map ops doing bpf_obj_free_fields). + */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); + } +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + u64 size = local_type_id__k; + void *p; + + p = bpf_mem_alloc(&bpf_global_ma, size); + if (!p) + return NULL; + if (meta) + bpf_obj_init(meta->field_offs, p); + return p; +} + +void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + void *p = p__alloc; + + if (meta) + bpf_obj_free_fields(meta->record, p); + bpf_mem_free(&bpf_global_ma, p); +} + +static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) +{ + struct list_head *n = (void *)node, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (unlikely(!n->next)) + INIT_LIST_HEAD(n); + tail ? list_add_tail(n, h) : list_add(n, h); +} + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, false); +} + +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, true); +} + +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +{ + struct list_head *n, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (list_empty(h)) + return NULL; + n = tail ? h->prev : h->next; + list_del_init(n); + return (struct bpf_list_node *)n; +} + +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) +{ + return __bpf_list_del(head, false); +} + +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) +{ + return __bpf_list_del(head, true); +} + +/** + * bpf_task_acquire - Acquire a reference to a task. A task acquired by this + * kfunc which is not stored in a map as a kptr, must be released by calling + * bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire(struct task_struct *p) +{ + return get_task_struct(p); +} + +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). 
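The object allocation and linked-list kfuncs defined earlier in this file are meant to be driven from BPF programs. A rough sketch of the expected usage, assuming the bpf_obj_new(), __contains() and private() convenience macros that the BPF selftests carry in bpf_experimental.h; everything here is illustrative, not taken from this patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

char LICENSE[] SEC("license") = "GPL";

struct item {
	int value;
	struct bpf_list_node node;
};

/* Lock-protected list head kept in global (map-backed) data. */
private(LOCK) struct bpf_spin_lock glock;
private(LOCK) struct bpf_list_head ghead __contains(item, node);

SEC("tc")
int push_one(struct __sk_buff *skb)
{
	struct item *it;

	it = bpf_obj_new(typeof(*it));	/* backed by bpf_obj_new_impl() */
	if (!it)
		return 0;
	it->value = skb->len;

	bpf_spin_lock(&glock);
	bpf_list_push_back(&ghead, &it->node);
	bpf_spin_unlock(&glock);
	return 0;
}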
+ * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + /* For the time being this function returns NULL, as it's not currently + * possible to safely acquire a reference to a task with RCU protection + * using get_task_struct() and put_task_struct(). This is due to the + * slightly odd mechanics of p->rcu_users, and how task RCU protection + * works. + * + * A struct task_struct is refcounted by two different refcount_t + * fields: + * + * 1. p->usage: The "true" refcount field which tracks a task's + * lifetime. The task is freed as soon as this + * refcount drops to 0. + * + * 2. p->rcu_users: An "RCU users" refcount field which is statically + * initialized to 2, and is co-located in a union with + * a struct rcu_head field (p->rcu). p->rcu_users + * essentially encapsulates a single p->usage + * refcount, and when p->rcu_users goes to 0, an RCU + * callback is scheduled on the struct rcu_head which + * decrements the p->usage refcount. + * + * There are two important implications to this task refcounting logic + * described above. The first is that + * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as + * after the refcount goes to 0, the RCU callback being scheduled will + * cause the memory backing the refcount to again be nonzero due to the + * fields sharing a union. The other is that we can't rely on RCU to + * guarantee that a task is valid in a BPF program. This is because a + * task could have already transitioned to being in the TASK_DEAD + * state, had its rcu_users refcount go to 0, and its rcu callback + * invoked in which it drops its single p->usage reference. At this + * point the task will be freed as soon as the last p->usage reference + * goes to 0, without waiting for another RCU gp to elapse. The only + * way that a BPF program can guarantee that a task is valid is in this + * scenario is to hold a p->usage refcount itself. + * + * Until we're able to resolve this issue, either by pulling + * p->rcu_users and p->rcu out of the union, or by getting rid of + * p->usage and just using p->rcu_users for refcounting, we'll just + * return NULL here. + */ + return NULL; +} + +/** + * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_task_release(). + * @pp: A pointer to a task kptr on which a reference is being acquired. + */ +struct task_struct *bpf_task_kptr_get(struct task_struct **pp) +{ + /* We must return NULL here until we have clarity on how to properly + * leverage RCU for ensuring a task's lifetime. See the comment above + * in bpf_task_acquire_not_zero() for more details. + */ + return NULL; +} + +/** + * bpf_task_release - Release the reference acquired on a task. + * @p: The task on which a reference is being released. + */ +void bpf_task_release(struct task_struct *p) +{ + if (!p) + return; + + put_task_struct(p); +} + +#ifdef CONFIG_CGROUPS +/** + * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by + * this kfunc which is not stored in a map as a kptr, must be released by + * calling bpf_cgroup_release(). + * @cgrp: The cgroup on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + return cgrp; +} + +/** + * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. 
A cgroup + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_cgroup_release(). + * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) +{ + struct cgroup *cgrp; + + rcu_read_lock(); + /* Another context could remove the cgroup from the map and release it + * at any time, including after we've done the lookup above. This is + * safe because we're in an RCU read region, so the cgroup is + * guaranteed to remain valid until at least the rcu_read_unlock() + * below. + */ + cgrp = READ_ONCE(*cgrpp); + + if (cgrp && !cgroup_tryget(cgrp)) + /* If the cgroup had been removed from the map and freed as + * described above, cgroup_tryget() will return false. The + * cgroup will be freed at some point after the current RCU gp + * has ended, so just return NULL to the user. + */ + cgrp = NULL; + rcu_read_unlock(); + + return cgrp; +} + +/** + * bpf_cgroup_release - Release the reference acquired on a cgroup. + * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to + * not be freed until the current grace period has ended, even if its refcount + * drops to 0. + * @cgrp: The cgroup on which a reference is being released. + */ +void bpf_cgroup_release(struct cgroup *cgrp) +{ + if (!cgrp) + return; + + cgroup_put(cgrp); +} + +/** + * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor + * array. A cgroup returned by this kfunc which is not subsequently stored in a + * map, must be released by calling bpf_cgroup_release(). + * @cgrp: The cgroup for which we're performing a lookup. + * @level: The level of ancestor to look up. + */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) +{ + struct cgroup *ancestor; + + if (level > cgrp->level || level < 0) + return NULL; + + ancestor = cgrp->ancestors[level]; + cgroup_get(ancestor); + return ancestor; +} +#endif /* CONFIG_CGROUPS */ + +/** + * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up + * in the root pid namespace idr. If a task is returned, it must either be + * stored in a map, or released with bpf_task_release(). + * @pid: The pid of the task being looked up. 
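From the BPF side, the task kfuncs above follow an acquire/release discipline: bpf_task_from_pid() may return NULL, and any non-NULL result must be handed to bpf_task_release(). A hedged sketch, declaring the kfuncs the way the selftests do (the tracepoint chosen is illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("tp_btf/task_newtask")
int BPF_PROG(on_newtask, struct task_struct *task, u64 clone_flags)
{
	struct task_struct *p;

	p = bpf_task_from_pid(1);	/* KF_ACQUIRE | KF_RET_NULL */
	if (!p)
		return 0;

	bpf_printk("pid 1 comm: %s", p->comm);
	bpf_task_release(p);		/* KF_RELEASE: always pair it */
	return 0;
}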
+ */ +struct task_struct *bpf_task_from_pid(s32 pid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) + bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + +void *bpf_cast_to_kern_ctx(void *obj) +{ + return obj; +} + +void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) +{ + return obj__ign; +} + +void bpf_rcu_read_lock(void) +{ + rcu_read_lock(); +} + +void bpf_rcu_read_unlock(void) +{ + rcu_read_unlock(); +} + +__diag_pop(); + +BTF_SET8_START(generic_btf_ids) +#ifdef CONFIG_KEXEC_CORE +BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) +#endif +BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_list_push_front) +BTF_ID_FLAGS(func, bpf_list_push_back) +BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +#endif +BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_SET8_END(generic_btf_ids) + +static const struct btf_kfunc_id_set generic_kfunc_set = { + .owner = THIS_MODULE, + .set = &generic_btf_ids, +}; + + +BTF_ID_LIST(generic_dtor_ids) +BTF_ID(struct, task_struct) +BTF_ID(func, bpf_task_release) +#ifdef CONFIG_CGROUPS +BTF_ID(struct, cgroup) +BTF_ID(func, bpf_cgroup_release) +#endif + +BTF_SET8_START(common_btf_ids) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) +BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_rcu_read_lock) +BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_SET8_END(common_btf_ids) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &common_btf_ids, +}; + +static int __init kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc generic_dtors[] = { + { + .btf_id = generic_dtor_ids[0], + .kfunc_btf_id = generic_dtor_ids[1] + }, +#ifdef CONFIG_CGROUPS + { + .btf_id = generic_dtor_ids[2], + .kfunc_btf_id = generic_dtor_ids[3] + }, +#endif + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, + ARRAY_SIZE(generic_dtors), + THIS_MODULE); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); +} + +late_initcall(kfunc_init); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 49ef0ce040c7..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -313,8 
+313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); + map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); if (!map) return ERR_PTR(-ENOMEM); @@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); - kfree(map); + bpf_map_area_free(map); } static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d789e3b831ad..d833496e9e42 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -558,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE); if (!trie) return ERR_PTR(-ENOMEM); @@ -609,7 +609,7 @@ static void trie_free(struct bpf_map *map) } out: - kfree(trie); + bpf_map_area_free(trie); } static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..38136ec4e095 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; struct fd f; + int ret; f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); @@ -20,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { - fdput(f); - return ERR_PTR(-EINVAL); + ret = -EINVAL; + goto put; } if (!inner_map->ops->map_meta_equal) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - - if (map_value_has_spin_lock(inner_map)) { - fdput(f); - return ERR_PTR(-ENOTSUPP); + ret = -ENOTSUPP; + goto put; } inner_map_meta_size = sizeof(*inner_map_meta); @@ -41,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { - fdput(f); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto put; } inner_map_meta->map_type = inner_map->map_type; @@ -50,9 +46,33 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; - inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); + + inner_map_meta->record = btf_record_dup(inner_map->record); + if (IS_ERR(inner_map_meta->record)) { + /* btf_record_dup returns NULL or valid pointer in case of + * invalid/empty/valid, but ERR_PTR in case of errors. During + * equality NULL or IS_ERR is equivalent. + */ + ret = PTR_ERR(inner_map_meta->record); + goto free; + } + if (inner_map_meta->record) { + struct btf_field_offs *field_offs; + /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always + * valid. 
+ */ + field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN); + if (!field_offs) { + ret = -ENOMEM; + goto free_rec; + } + inner_map_meta->field_offs = field_offs; + } + /* Note: We must use the same BTF, as we also used btf_record_dup above + * which relies on BTF being same for both maps, as some members like + * record->fields.list_head have pointers like value_rec pointing into + * inner_map->btf. + */ if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -68,11 +88,19 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) fdput(f); return inner_map_meta; +free_rec: + btf_record_free(inner_map_meta->record); +free: + kfree(inner_map_meta); +put: + fdput(f); + return ERR_PTR(ret); } void bpf_map_meta_free(struct bpf_map *map_meta) { - bpf_map_free_kptr_off_tab(map_meta); + kfree(map_meta->field_offs); + bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -84,9 +112,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && - bpf_map_equal_kptr_off_tab(meta0, meta1); + btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c new file mode 100644 index 000000000000..ebcc3dd0fa19 --- /dev/null +++ b/kernel/bpf/memalloc.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <linux/mm.h> +#include <linux/llist.h> +#include <linux/bpf.h> +#include <linux/irq_work.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h> +#include <asm/local.h> + +/* Any context (including NMI) BPF specific memory allocator. + * + * Tracing BPF programs can attach to kprobe and fentry. Hence they + * run in unknown context where calling plain kmalloc() might not be safe. + * + * Front-end kmalloc() with per-cpu per-bucket cache of free elements. + * Refill this cache asynchronously from irq_work. + * + * CPU_0 buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * ... + * CPU_N buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * + * The buckets are prefilled at the start. + * BPF programs always run with migration disabled. + * It's safe to allocate from cache of the current cpu with irqs disabled. + * Free-ing is always done into bucket of the current cpu as well. + * irq_work trims extra free elements from buckets with kfree + * and refills them with kmalloc, so global kmalloc logic takes care + * of freeing objects allocated by one cpu and freed on another. + * + * Every allocated objected is padded with extra 8 bytes that contains + * struct llist_node. 
+ */ +#define LLIST_NODE_SZ sizeof(struct llist_node) + +/* similar to kmalloc, but sizeof == 8 bucket is gone */ +static u8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 3, /* 16 */ + 4, /* 24 */ + 4, /* 32 */ + 5, /* 40 */ + 5, /* 48 */ + 5, /* 56 */ + 5, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 6, /* 104 */ + 6, /* 112 */ + 6, /* 120 */ + 6, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static int bpf_mem_cache_idx(size_t size) +{ + if (!size || size > 4096) + return -1; + + if (size <= 192) + return size_index[(size - 1) / 8] - 1; + + return fls(size - 1) - 1; +} + +#define NUM_CACHES 11 + +struct bpf_mem_cache { + /* per-cpu list of free objects of size 'unit_size'. + * All accesses are done with interrupts disabled and 'active' counter + * protection with __llist_add() and __llist_del_first(). + */ + struct llist_head free_llist; + local_t active; + + /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill + * are sequenced by per-cpu 'active' counter. But unit_free() cannot + * fail. When 'active' is busy the unit_free() will add an object to + * free_llist_extra. + */ + struct llist_head free_llist_extra; + + struct irq_work refill_work; + struct obj_cgroup *objcg; + int unit_size; + /* count of objects in free_llist */ + int free_cnt; + int low_watermark, high_watermark, batch; + int percpu_size; + + struct rcu_head rcu; + struct llist_head free_by_rcu; + struct llist_head waiting_for_gp; + atomic_t call_rcu_in_progress; +}; + +struct bpf_mem_caches { + struct bpf_mem_cache cache[NUM_CACHES]; +}; + +static struct llist_node notrace *__llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *next; + + entry = head->first; + if (!entry) + return NULL; + next = entry->next; + head->first = next; + return entry; +} + +static void *__alloc(struct bpf_mem_cache *c, int node) +{ + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + + if (c->percpu_size) { + void **obj = kmalloc_node(c->percpu_size, flags, node); + void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + + if (!obj || !pptr) { + free_percpu(pptr); + kfree(obj); + return NULL; + } + obj[1] = pptr; + return obj; + } + + return kmalloc_node(c->unit_size, flags, node); +} + +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) +{ +#ifdef CONFIG_MEMCG_KMEM + if (c->objcg) + return get_mem_cgroup_from_objcg(c->objcg); +#endif + +#ifdef CONFIG_MEMCG + return root_mem_cgroup; +#else + return NULL; +#endif +} + +/* Mostly runs from irq_work except __init phase. */ +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +{ + struct mem_cgroup *memcg = NULL, *old_memcg; + unsigned long flags; + void *obj; + int i; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (i = 0; i < cnt; i++) { + /* + * free_by_rcu is only manipulated by irq work refill_work(). + * IRQ works on the same CPU are called sequentially, so it is + * safe to use __llist_del_first() here. If alloc_bulk() is + * invoked by the initial prefill, there will be no running + * refill_work(), so __llist_del_first() is fine as well. + * + * In most cases, objects on free_by_rcu are from the same CPU. 
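For readers following the bucket math, here is a minimal user-space sketch of the size-to-bucket mapping implied by the size_index[] table and bpf_mem_cache_idx() above: requests are rounded up to one of the eleven fixed unit sizes (for requests above 192 bytes the index has to come out as fls(size - 1) - 2 so that a 4096-byte request lands in the last of the NUM_CACHES == 11 buckets). The scan-based lookup below is an assumed stand-in for the table-plus-fls computation, not the kernel code itself.

#include <stdio.h>
#include <stddef.h>

static const int bucket_sizes[11] = {
	16, 32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096
};

static int bucket_for(size_t size)
{
	int i;

	if (!size || size > 4096)
		return -1;
	for (i = 0; i < 11; i++)
		if (size <= (size_t)bucket_sizes[i])
			return bucket_sizes[i];
	return -1;	/* unreachable: size > 4096 was rejected above */
}

int main(void)
{
	const size_t sizes[] = { 8, 100, 192, 200, 4096 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("request %4zu -> %4d byte bucket\n",
		       sizes[i], bucket_for(sizes[i]));
	return 0;
}

In the allocator itself the lookup is done on size + LLIST_NODE_SZ, since every object carries the extra llist_node header.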
+ * If some objects come from other CPUs, it doesn't incur any + * harm because NUMA_NO_NODE means the preference for current + * numa node and it is not a guarantee. + */ + obj = __llist_del_first(&c->free_by_rcu); + if (!obj) { + obj = __alloc(c, node); + if (!obj) + break; + } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + } + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +static void free_one(struct bpf_mem_cache *c, void *obj) +{ + if (c->percpu_size) { + free_percpu(((void **)obj)[1]); + kfree(obj); + return; + } + + kfree(obj); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + struct llist_node *pos, *t; + + llist_for_each_safe(pos, t, llnode) + free_one(c, pos); + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void __free_rcu_tasks_trace(struct rcu_head *head) +{ + /* If RCU Tasks Trace grace period implies RCU grace period, + * there is no need to invoke call_rcu(). + */ + if (rcu_trace_implies_rcu_gp()) + __free_rcu(head); + else + call_rcu(head, __free_rcu); +} + +static void enque_to_free(struct bpf_mem_cache *c, void *obj) +{ + struct llist_node *llnode = obj; + + /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. + * Nothing races to add to free_by_rcu list. + */ + __llist_add(llnode, &c->free_by_rcu); +} + +static void do_call_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) + return; + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + /* There is no concurrent __llist_add(waiting_for_gp) access. + * It doesn't race with llist_del_all either. + * But there could be two concurrent llist_del_all(waiting_for_gp): + * from __free_rcu() and from drain_mem_cache(). + */ + __llist_add(llnode, &c->waiting_for_gp); + /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * If RCU Tasks Trace grace period implies RCU grace period, free + * these elements directly, else use call_rcu() to wait for normal + * progs to finish and finally do free_one() on each element. 
+ */ + call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); +} + +static void free_bulk(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + int cnt; + + do { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(flags); + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + else + cnt = 0; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + if (llnode) + enque_to_free(c, llnode); + } while (cnt > (c->high_watermark + c->low_watermark) / 2); + + /* and drain free_llist_extra */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + enque_to_free(c, llnode); + do_call_rcu(c); +} + +static void bpf_mem_refill(struct irq_work *work) +{ + struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); + int cnt; + + /* Racy access to free_cnt. It doesn't need to be 100% accurate */ + cnt = c->free_cnt; + if (cnt < c->low_watermark) + /* irq_work runs on this cpu and kmalloc will allocate + * from the current numa node which is what we want here. + */ + alloc_bulk(c, c->batch, NUMA_NO_NODE); + else if (cnt > c->high_watermark) + free_bulk(c); +} + +static void notrace irq_work_raise(struct bpf_mem_cache *c) +{ + irq_work_queue(&c->refill_work); +} + +/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket + * the freelist cache will be elem_size * 64 (or less) on each cpu. + * + * For bpf programs that don't have statically known allocation sizes and + * assuming (low_mark + high_mark) / 2 as an average number of elements per + * bucket and all buckets are used the total amount of memory in freelists + * on each cpu will be: + * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 + * == ~ 116 Kbyte using below heuristic. + * Initialized, but unused bpf allocator (not bpf map specific one) will + * consume ~ 11 Kbyte per cpu. + * Typical case will be between 11K and 116K closer to 11K. + * bpf progs can and should share bpf_mem_cache when possible. + */ + +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + init_irq_work(&c->refill_work, bpf_mem_refill); + if (c->unit_size <= 256) { + c->low_watermark = 32; + c->high_watermark = 96; + } else { + /* When page_size == 4k, order-0 cache will have low_mark == 2 + * and high_mark == 6 with batch alloc of 3 individual pages at + * a time. + * 8k allocs and above low == 1, high == 3, batch == 1. + */ + c->low_watermark = max(32 * 256 / c->unit_size, 1); + c->high_watermark = max(96 * 256 / c->unit_size, 3); + } + c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); + + /* To avoid consuming memory assume that 1st run of bpf + * prog won't be doing more than 4 map_update_elem from + * irq disabled region + */ + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); +} + +/* When size != 0 bpf_mem_cache for each cpu. + * This is typical bpf hash map use case when all elements have equal size. + * + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on + * kmalloc/kfree. Max allocation size is 4096 in this case. + * This is bpf_dynptr and bpf_kptr use case. 
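To make the watermark heuristic in prefill_mem_cache() above concrete, the stand-alone sketch below simply re-applies the same formulas for a few unit sizes; for a 4096-byte unit it reproduces the low == 2, high == 6, batch == 3 case mentioned in the comment.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	const int unit_sizes[] = { 16, 64, 256, 512, 4096 };
	int i;

	for (i = 0; i < 5; i++) {
		int unit = unit_sizes[i];
		int low, high, batch;

		if (unit <= 256) {
			low = 32;
			high = 96;
		} else {
			low = MAX(32 * 256 / unit, 1);
			high = MAX(96 * 256 / unit, 3);
		}
		batch = MAX((high - low) / 4 * 3, 1);
		printf("unit %4d: low %2d high %2d batch %2d\n",
		       unit, low, high, batch);
	}
	return 0;
}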
+ */ +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) +{ + static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_cache *c, __percpu *pc; + struct obj_cgroup *objcg = NULL; + int cpu, i, unit_size, percpu_size = 0; + + if (size) { + pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); + if (!pc) + return -ENOMEM; + + if (percpu) + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + else + size += LLIST_NODE_SZ; /* room for llist_node */ + unit_size = size; + +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(pc, cpu); + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + prefill_mem_cache(c, cpu); + } + ma->cache = pc; + return 0; + } + + /* size == 0 && percpu is an invalid combination */ + if (WARN_ON_ONCE(percpu)) + return -EINVAL; + + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->unit_size = sizes[i]; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + } + ma->caches = pcc; + return 0; +} + +static void drain_mem_cache(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + /* No progs are using this bpf_mem_cache, but htab_map_free() called + * bpf_mem_cache_free() for all remaining elements and they can be in + * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + * + * Except for waiting_for_gp list, there are no concurrent operations + * on these lists, so it is safe to use __llist_del_all(). + */ + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) +{ + free_percpu(ma->cache); + free_percpu(ma->caches); + ma->cache = NULL; + ma->caches = NULL; +} + +static void free_mem_alloc(struct bpf_mem_alloc *ma) +{ + /* waiting_for_gp lists was drained, but __free_rcu might + * still execute. Wait for it now before we freeing percpu caches. + * + * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), + * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used + * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), + * so if call_rcu(head, __free_rcu) is skipped due to + * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by + * using rcu_trace_implies_rcu_gp() as well. + */ + rcu_barrier_tasks_trace(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); + free_mem_alloc_no_barrier(ma); +} + +static void free_mem_alloc_deferred(struct work_struct *work) +{ + struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); + + free_mem_alloc(ma); + kfree(ma); +} + +static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) +{ + struct bpf_mem_alloc *copy; + + if (!rcu_in_progress) { + /* Fast path. No callbacks are pending, hence no need to do + * rcu_barrier-s. 
+ */ + free_mem_alloc_no_barrier(ma); + return; + } + + copy = kmalloc(sizeof(*ma), GFP_KERNEL); + if (!copy) { + /* Slow path with inline barrier-s */ + free_mem_alloc(ma); + return; + } + + /* Defer barriers into worker to let the rest of map memory to be freed */ + copy->cache = ma->cache; + ma->cache = NULL; + copy->caches = ma->caches; + ma->caches = NULL; + INIT_WORK(©->work, free_mem_alloc_deferred); + queue_work(system_unbound_wq, ©->work); +} + +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i, rcu_in_progress; + + if (ma->cache) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + /* + * refill_work may be unfinished for PREEMPT_RT kernel + * in which irq work is invoked in a per-CPU RT thread. + * It is also possible for kernel with + * arch_irq_work_has_interrupt() being false and irq + * work is invoked in timer interrupt. So waiting for + * the completion of irq work to ease the handling of + * concurrency. + */ + irq_work_sync(&c->refill_work); + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + /* objcg is the same across cpus */ + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } + if (ma->caches) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + irq_work_sync(&c->refill_work); + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + } + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } +} + +/* notrace is necessary here and in other functions to make sure + * bpf programs cannot attach to them and cause llist corruptions. + */ +static void notrace *unit_alloc(struct bpf_mem_cache *c) +{ + struct llist_node *llnode = NULL; + unsigned long flags; + int cnt = 0; + + /* Disable irqs to prevent the following race for majority of prog types: + * prog_A + * bpf_mem_alloc + * preemption or irq -> prog_B + * bpf_mem_alloc + * + * but prog_B could be a perf_event NMI prog. + * Use per-cpu 'active' counter to order free_list access between + * unit_alloc/unit_free/bpf_mem_refill. + */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + } + local_dec(&c->active); + local_irq_restore(flags); + + WARN_ON(cnt < 0); + + if (cnt < c->low_watermark) + irq_work_raise(c); + return llnode; +} + +/* Though 'ptr' object could have been allocated on a different cpu + * add it to the free_llist of the current cpu. + * Let kfree() logic deal with it when it's later called from irq_work. + */ +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + int cnt = 0; + + BUILD_BUG_ON(LLIST_NODE_SZ > 8); + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(llnode, &c->free_llist); + cnt = ++c->free_cnt; + } else { + /* unit_free() cannot fail. Therefore add an object to atomic + * llist. free_bulk() will drain it. Though free_llist_extra is + * a per-cpu list we have to use atomic llist_add here, since + * it also can be interrupted by bpf nmi prog that does another + * unit_free() into the same free_llist_extra. 
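A hedged sketch of how a caller might drive the allocator's public entry points, using bpf_mem_alloc_init() and bpf_mem_alloc_destroy() above together with the bpf_mem_cache_alloc()/bpf_mem_cache_free() wrappers defined a little further down; struct my_elem and the surrounding context are assumptions, only the calling convention is taken from this file.

#include <linux/bpf_mem_alloc.h>

struct my_elem {
	long payload[4];
};

static struct bpf_mem_alloc my_ma;

static int my_cache_init(void)
{
	/* One fixed-size, non-percpu bucket per CPU. */
	return bpf_mem_alloc_init(&my_ma, sizeof(struct my_elem), false);
}

static void my_cache_use(void)
{
	struct my_elem *e;

	/* Usable in any context, including NMI; may return NULL. */
	e = bpf_mem_cache_alloc(&my_ma);
	if (!e)
		return;
	e->payload[0] = 1;
	bpf_mem_cache_free(&my_ma, e);
}

static void my_cache_fini(void)
{
	bpf_mem_alloc_destroy(&my_ma);
}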
+ */ + llist_add(llnode, &c->free_llist_extra); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (cnt > c->high_watermark) + /* free few objects from current cpu into global kmalloc pool */ + irq_work_raise(c); +} + +/* Called from BPF program or from sys_bpf syscall. + * In both cases migration is disabled. + */ +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) +{ + int idx; + void *ret; + + if (!size) + return ZERO_SIZE_PTR; + + idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (idx < 0) + return NULL; + + ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) +{ + void *ret; + + ret = unit_alloc(this_cpu_ptr(ma->cache)); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free(this_cpu_ptr(ma->cache), ptr); +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index bd09290e3648..13e4efc971e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); - offmap = kzalloc(sizeof(*offmap), GFP_USER); + offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE); if (!offmap) return ERR_PTR(-ENOMEM); @@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) err_unlock: up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); return ERR_PTR(err); } @@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map) up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 00b874c8e889..034cf87b54e9 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, { int cpu, orig_cpu; - orig_cpu = cpu = raw_smp_processor_id(); + orig_cpu = raw_smp_processor_id(); while (1) { - struct pcpu_freelist_head *head; + for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { + struct pcpu_freelist_head *head; - head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } } - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; /* cannot lock any per cpu lock, try extralist */ - if (cpu == orig_cpu && - pcpu_freelist_try_push_extra(s, node)) + if (pcpu_freelist_try_push_extra(s, node)) return; } } @@ -102,22 +100,21 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; - int i, cpu, pcpu_entries; + unsigned int cpu, cpu_idx, i, j, n, m; - pcpu_entries = nr_elems / num_possible_cpus() + 1; - i = 0; + n = nr_elems / num_possible_cpus(); + m = nr_elems % 
num_possible_cpus(); + cpu_idx = 0; for_each_possible_cpu(cpu) { -again: head = per_cpu_ptr(s->freelist, cpu); - /* No locking required as this is not visible yet. */ - pcpu_freelist_push_node(head, buf); - i++; - buf += elem_size; - if (i == nr_elems) - break; - if (i % pcpu_entries) - goto again; + j = n + (cpu_idx < m ? 1 : 0); + for (i = 0; i < j; i++) { + /* No locking required as this is not visible yet. */ + pcpu_freelist_push_node(head, buf); + buf += elem_size; + } + cpu_idx++; } } @@ -125,13 +122,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; raw_spin_lock(&head->lock); node = head->first; if (node) { @@ -140,12 +136,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) return node; } raw_spin_unlock(&head->lock); -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* per cpu lists are all empty, try extralist */ @@ -164,13 +154,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) - goto next_cpu; + continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { @@ -180,12 +169,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) } raw_spin_unlock(&head->lock); } -next_cpu: - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* cannot pop from per cpu lists, try extralist */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index a1c0794ae49d..8a5e060de63b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -78,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) if (!qs) return ERR_PTR(-ENOMEM); - memset(qs, 0, sizeof(*qs)); - bpf_map_init_from_attr(&qs->map, attr); qs->size = size; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index ded4faeca192..80f4b4d88aaf 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,43 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. 
This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -116,7 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); return NULL; } @@ -136,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -164,7 +198,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); + rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); @@ -172,7 +206,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { - kfree(rb_map); + bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } @@ -190,7 +224,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb) vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) @@ -199,7 +233,7 @@ static void ringbuf_map_free(struct bpf_map *map) rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); - kfree(rb_map); + bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) @@ -224,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +276,26 @@ static int ringbuf_map_mmap(struct 
bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -251,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -264,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, - .map_poll = ringbuf_map_poll, + .map_mmap = ringbuf_map_mmap_kern, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -278,6 +350,20 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. 
This page offset is stored as part of record metadata and allows to @@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -361,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, - .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, @@ -404,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -417,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -553,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. */ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. 
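As a rough illustration of the header handling in __bpf_user_ringbuf_peek() above: the top two bits of the 32-bit header carry the busy/discard flags and the rest the sample length, and the slot consumed in the ring is the 8-byte header plus the sample, rounded up to 8. The constants below mirror BPF_RINGBUF_BUSY_BIT, BPF_RINGBUF_DISCARD_BIT and BPF_RINGBUF_HDR_SZ from the UAPI header; the round_up8() helper is an assumption for the sketch.

#include <stdio.h>
#include <stdint.h>

#define BPF_RINGBUF_BUSY_BIT	(1U << 31)
#define BPF_RINGBUF_DISCARD_BIT	(1U << 30)
#define BPF_RINGBUF_HDR_SZ	8U

static uint32_t round_up8(uint32_t v)
{
	return (v + 7) & ~7U;
}

int main(void)
{
	/* Hypothetical 13-byte sample that the producer chose to discard. */
	uint32_t hdr = BPF_RINGBUF_DISCARD_BIT | 13;
	uint32_t flags = hdr & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	uint32_t sample_len = hdr & ~flags;
	uint32_t total_len = round_up8(sample_len + BPF_RINGBUF_HDR_SZ);

	printf("sample_len %u total_len %u busy %d discard %d\n",
	       sample_len, total_len,
	       !!(flags & BPF_RINGBUF_BUSY_BIT),
	       !!(flags & BPF_RINGBUF_DISCARD_BIT));
	return 0;	/* prints: sample_len 13 total_len 24 busy 0 discard 1 */
}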
+ */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. 
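On the BPF side, bpf_user_ringbuf_drain() is meant to be called with a callback that receives each sample as a read-only dynptr. A rough sketch, assuming a BPF_MAP_TYPE_USER_RINGBUF map and an attach point where the helper is available; the fentry target and the rest of the scaffolding are assumptions, not part of this patch.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
	__uint(max_entries, 256 * 1024);
} user_rb SEC(".maps");

long drained;

static long drain_cb(struct bpf_dynptr *dynptr, void *ctx)
{
	drained++;
	return 0;	/* keep draining; a non-zero return stops early */
}

SEC("fentry/__x64_sys_getpgid")
int BPF_PROG(drain_samples)
{
	/* Consumes up to BPF_MAX_USER_RINGBUF_SAMPLES samples per call and
	 * returns the number of samples consumed, or a negative error such
	 * as -EBUSY when another consumer holds the busy bit.
	 */
	bpf_user_ringbuf_drain(&user_rb, drain_cb, NULL, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";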
+ */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 27760627370d..64131f88c553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -175,8 +175,8 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } -static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, - void *value, __u64 flags) +static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, + void *key, void *value, __u64 flags) { int err; @@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { - return bpf_fd_array_map_update_elem(map, f.file, key, value, + return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } @@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, + err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { @@ -495,114 +495,181 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -static int bpf_map_kptr_off_cmp(const void *a, const void *b) +static int btf_field_cmp(const void *a, const void *b) { - const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + const struct btf_field *f1 = a, *f2 = b; - if (off_desc1->offset < off_desc2->offset) + if (f1->offset < f2->offset) return -1; - else if (off_desc1->offset > off_desc2->offset) + else if (f1->offset > f2->offset) return 1; return 0; } -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, + enum btf_field_type 
type) { - /* Since members are iterated in btf_find_field in increasing order, - * offsets appended to kptr_off_tab are in increasing order, so we can - * do bsearch to find exact match. - */ - struct bpf_map_value_off *tab; + struct btf_field *field; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + return NULL; + field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); + if (!field || !(field->type & type)) return NULL; - tab = map->kptr_off_tab; - return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); + return field; } -void bpf_map_free_kptr_off_tab(struct bpf_map *map) +void btf_record_free(struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab; int i; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < tab->nr_off; i++) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + for (i = 0; i < rec->cnt; i++) { + switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (rec->fields[i].kptr.module) + module_put(rec->fields[i].kptr.module); + btf_put(rec->fields[i].kptr.btf); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + /* Nothing to release for bpf_list_head */ + break; + default: + WARN_ON_ONCE(1); + continue; + } } - kfree(tab); - map->kptr_off_tab = NULL; + kfree(rec); } -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +void bpf_map_free_record(struct bpf_map *map) { - struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; - int size, i; + btf_record_free(map->record); + map->record = NULL; +} - if (!map_value_has_kptrs(map)) - return ERR_PTR(-ENOENT); - size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); - new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); - if (!new_tab) +struct btf_record *btf_record_dup(const struct btf_record *rec) +{ + const struct btf_field *fields; + struct btf_record *new_rec; + int ret, size, i; + + if (IS_ERR_OR_NULL(rec)) + return NULL; + size = offsetof(struct btf_record, fields[rec->cnt]); + new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_rec) return ERR_PTR(-ENOMEM); - /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) { - btf_get(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { - while (i--) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + /* Do a deep copy of the btf_record */ + fields = rec->fields; + new_rec->cnt = 0; + for (i = 0; i < rec->cnt; i++) { + switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + btf_get(fields[i].kptr.btf); + if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { + ret = -ENXIO; + goto free; } - kfree(new_tab); - return ERR_PTR(-ENXIO); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + /* Nothing to acquire for bpf_list_head */ + break; + default: + ret = -EFAULT; + WARN_ON_ONCE(1); + goto free; } + new_rec->cnt++; } - return new_tab; + return new_rec; +free: + btf_record_free(new_rec); + return ERR_PTR(ret); } -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { - struct bpf_map_value_off *tab_a = 
map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; - bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; - if (!a_has_kptr && !b_has_kptr) + if (!a_has_fields && !b_has_fields) return true; - if (a_has_kptr != b_has_kptr) + if (a_has_fields != b_has_fields) return false; - if (tab_a->nr_off != tab_b->nr_off) + if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); - return !memcmp(tab_a, tab_b, size); + size = offsetof(struct btf_record, fields[rec_a->cnt]); + /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused + * members are zeroed out. So memcmp is safe to do without worrying + * about padding/unused fields. + * + * While spin_lock, timer, and kptr have no relation to map BTF, + * list_head metadata is specific to map BTF, the btf and value_rec + * members in particular. btf is the map BTF, while value_rec points to + * btf_record in that map BTF. + * + * So while by default, we don't rely on the map BTF (which the records + * were parsed from) matching for both records, which is not backwards + * compatible, in case list_head is part of it, we implicitly rely on + * that by way of depending on memcmp succeeding for it. + */ + return !memcmp(rec_a, rec_b, size); } -/* Caller must ensure map_value_has_kptrs is true. Note that this function can - * be called on a map value while the map_value is visible to BPF programs, as - * it ensures the correct synchronization, and we already enforce the same using - * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. - */ -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - unsigned long *btf_id_ptr; - int i; + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} - for (i = 0; i < tab->nr_off; i++) { - struct bpf_map_value_off_desc *off_desc = &tab->off[i]; - unsigned long old_ptr; +void bpf_obj_free_fields(const struct btf_record *rec, void *obj) +{ + const struct btf_field *fields; + int i; - btf_id_ptr = map_value + off_desc->offset; - if (off_desc->type == BPF_KPTR_UNREF) { - u64 *p = (u64 *)btf_id_ptr; + if (IS_ERR_OR_NULL(rec)) + return; + fields = rec->fields; + for (i = 0; i < rec->cnt; i++) { + const struct btf_field *field = &fields[i]; + void *field_ptr = obj + field->offset; - WRITE_ONCE(p, 0); + switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; + case BPF_KPTR_UNREF: + WRITE_ONCE(*(u64 *)field_ptr, 0); + break; + case BPF_KPTR_REF: + field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + break; + case BPF_LIST_HEAD: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); + break; + case BPF_LIST_NODE: + break; + default: + WARN_ON_ONCE(1); continue; } - old_ptr = xchg(btf_id_ptr, 0); - off_desc->kptr.dtor((void *)old_ptr); } } @@ -610,14 +677,24 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct btf_field_offs *foffs = map->field_offs; + struct btf_record *rec = map->record; security_bpf_map_free(map); - kfree(map->off_arr); 
bpf_map_release_memcg(map); - /* implementation dependent freeing, map_free callback also does - * bpf_map_free_kptr_off_tab, if needed. - */ + /* implementation dependent freeing */ map->ops->map_free(map); + /* Delay freeing of field_offs and btf_record for maps, as map_free + * callback usually needs access to them. It is better to do it here + * than require each callback to do the free itself manually. + * + * Note that the btf_record stashed in map->inner_map_meta->record was + * already freed using the map_free callback for map in map case which + * eventually calls bpf_map_free_meta, since inner_map_meta is only a + * template bpf_map struct used during verification. + */ + kfree(foffs); + btf_record_free(rec); } static void bpf_map_put_uref(struct bpf_map *map) @@ -638,7 +715,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); - schedule_work(&map->work); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); } } @@ -775,8 +855,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || map_value_has_kptrs(map)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -903,84 +982,6 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) -{ - struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->off_arr->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = map->off_arr->field_sz + (a - off_base); - sz_b = map->off_arr->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -static int bpf_map_alloc_off_arr(struct bpf_map *map) -{ - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); - bool has_kptrs = map_value_has_kptrs(map); - struct bpf_map_off_arr *off_arr; - u32 i; - - if (!has_spin_lock && !has_timer && !has_kptrs) { - map->off_arr = NULL; - return 0; - } - - off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); - if (!off_arr) - return -ENOMEM; - map->off_arr = off_arr; - - off_arr->cnt = 0; - if (has_spin_lock) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->spin_lock_off; - off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); - off_arr->cnt++; - } - if (has_timer) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->timer_off; - off_arr->field_sz[i] = sizeof(struct bpf_timer); - off_arr->cnt++; - } - if (has_kptrs) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - u32 *off = &off_arr->field_off[off_arr->cnt]; - u8 *sz = &off_arr->field_sz[off_arr->cnt]; - - for (i = 0; i < tab->nr_off; i++) { - *off++ = tab->off[i].offset; - *sz++ = sizeof(u64); - } - off_arr->cnt += tab->nr_off; - } - - if (off_arr->cnt == 1) - return 0; - sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), - map_off_arr_cmp, map_off_arr_swap, map); - return 0; -} - static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 
btf_key_id, u32 btf_value_id) { @@ -1003,39 +1004,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } + map->record = btf_parse_fields(btf, value_type, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, + map->value_size); + if (!IS_ERR_OR_NULL(map->record)) { + int i; - map->kptr_off_tab = btf_parse_kptrs(btf, value_type); - if (map_value_has_kptrs(map)) { if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; @@ -1044,14 +1018,60 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ret = -EACCES; goto free_map_tab; } - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) { - ret = -EOPNOTSUPP; - goto free_map_tab; + for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { + switch (map->record->field_mask & (1 << i)) { + case 0: + continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_LIST_HEAD: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + default: + /* Fail if map_type checks are missing for a field type */ + ret = -EOPNOTSUPP; + goto free_map_tab; + } } } + ret = btf_check_and_fixup_fields(btf, map->record); + if (ret < 0) + goto free_map_tab; + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) @@ -1060,7 +1080,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; free_map_tab: - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); return ret; } @@ -1069,6 +1089,7 @@ free_map_tab: static 
int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1113,8 +1134,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1150,13 +1169,17 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = bpf_map_alloc_off_arr(map); - if (err) + + foffs = btf_parse_field_offs(map->record); + if (IS_ERR(foffs)) { + err = PTR_ERR(foffs); goto free_map; + } + map->field_offs = foffs; err = security_bpf_map_alloc(map); if (err) - goto free_map_off_arr; + goto free_map_field_offs; err = bpf_map_alloc_id(map); if (err) @@ -1180,8 +1203,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_off_arr: - kfree(map->off_arr); +free_map_field_offs: + kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1328,7 +1351,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1401,7 +1424,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1413,19 +1436,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } value_size = bpf_map_value_size(map); - - err = -ENOMEM; - value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); - if (!value) + value = kvmemdup_bpfptr(uvalue, value_size); + if (IS_ERR(value)) { + err = PTR_ERR(value); goto free_key; + } - err = -EFAULT; - if (copy_from_bpfptr(value, uvalue, value_size) != 0) - goto free_value; - - err = bpf_map_update_value(map, f, key, value, attr->flags); + err = bpf_map_update_value(map, f.file, key, value, attr->flags); -free_value: kvfree(value); free_key: kvfree(key); @@ -1437,9 +1455,9 @@ err_put: #define BPF_MAP_DELETE_ELEM_LAST_FIELD key -static int map_delete_elem(union bpf_attr *attr) +static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -1459,7 +1477,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1569,7 +1587,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1610,23 +1628,21 @@ int generic_map_delete_batch(struct bpf_map *map, return err; } -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; - int ufd = attr->batch.map_fd; void *key, 
*value; - struct fd f; int err = 0; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1646,7 +1662,6 @@ int generic_map_update_batch(struct bpf_map *map, return -ENOMEM; } - f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, @@ -1654,7 +1669,7 @@ int generic_map_update_batch(struct bpf_map *map, copy_from_user(value, values + cp * value_size, value_size)) break; - err = bpf_map_update_value(map, f, key, value, + err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) @@ -1667,7 +1682,6 @@ int generic_map_update_batch(struct bpf_map *map, kvfree(value); kvfree(key); - fdput(f); return err; } @@ -1689,7 +1703,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1811,7 +1825,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1882,8 +1896,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } @@ -2094,6 +2107,17 @@ struct bpf_prog_kstats { u64 misses; }; +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + unsigned int flags; + + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { @@ -2107,11 +2131,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, st = per_cpu_ptr(prog->stats, cpu); do { - start = u64_stats_fetch_begin_irq(&st->syncp); + start = u64_stats_fetch_begin(&st->syncp); tnsecs = u64_stats_read(&st->nsecs); tcnt = u64_stats_read(&st->cnt); tmisses = u64_stats_read(&st->misses); - } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + } while (u64_stats_fetch_retry(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; misses += tmisses; @@ -3494,9 +3518,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LSM: if (ptype == BPF_PROG_TYPE_LSM && prog->expected_attach_type != BPF_LSM_CGROUP) - return -EINVAL; - - ret = cgroup_bpf_prog_attach(attr, ptype, prog); + ret = -EINVAL; + else + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: ret = -EINVAL; @@ -4395,7 +4419,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (attr->task_fd_query.flags != 0) return -EINVAL; + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); if (!task) return -ENOENT; @@ -4448,13 +4474,13 @@ put_file: #define BPF_MAP_BATCH_LAST_FIELD batch.flags -#define BPF_DO_BATCH(fn) \ +#define BPF_DO_BATCH(fn, ...) 
\ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ - err = fn(map, attr, uattr); \ + err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, @@ -4488,13 +4514,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, } if (cmd == BPF_MAP_LOOKUP_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_batch); + BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) - BPF_DO_BATCH(map->ops->map_update_batch); + BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); else - BPF_DO_BATCH(map->ops->map_delete_batch); + BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) bpf_map_write_active_dec(map); @@ -4941,7 +4967,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: - err = map_delete_elem(&attr); + err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); @@ -5073,8 +5099,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: + case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: + case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: @@ -5119,13 +5147,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) run_ctx.bpf_cookie = 0; run_ctx.saved_run_ctx = NULL; - if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); bpf_prog_put(prog); return 0; #endif diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..c2a2182ce570 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,8 +10,17 @@ #include <linux/btf_ids.h> #include "mmap_unlock_work.h" +static const char * const iter_task_type_names[] = { + "ALL", + "TID", + "PID", +}; + struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. 
+ */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. + */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info { static struct file * 
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +291,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +323,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -299,19 +445,18 @@ struct bpf_iter_seq_task_vma_info { }; enum bpf_task_vma_iter_find_op { - task_vma_iter_first_vma, /* use mm->mmap */ - task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_first_vma, /* use find_vma() with addr 0 */ + task_vma_iter_next_vma, /* use vma_next() with curr_vma */ task_vma_iter_find_vma, /* use find_vma() to find next vma */ }; static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. 
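[Editor's note, not part of the patch] The bpf_iter_attach_task() hook added above accepts exactly one of tid, pid or pid_fd from union bpf_iter_link_info and records it as BPF_TASK_ITER_TID or BPF_TASK_ITER_TGID for the seq_file walker. A minimal user-space sketch of how those parameters are meant to be consumed is shown below. It assumes the matching linfo.task.* uapi fields introduced by this series and standard libbpf helpers (bpf_program__attach_iter(), bpf_iter_create(), libbpf 1.0 error conventions); the function name and the already-loaded iter/task program passed in are hypothetical.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Attach an already-loaded iter/task program so that it only walks the
 * threads of one thread group, then dump the iterator output to stdout.
 * Relies on the linfo.task.pid field added by this series. */
static int dump_one_tgid(struct bpf_program *prog, __u32 tgid)
{
	union bpf_iter_link_info linfo;
	struct bpf_link *link;
	char buf[4096];
	int iter_fd, len, err = 0;

	memset(&linfo, 0, sizeof(linfo));
	linfo.task.pid = tgid;	/* at most one of .tid, .pid, .pid_fd may be set */

	LIBBPF_OPTS(bpf_iter_attach_opts, opts,
		    .link_info = &linfo,
		    .link_info_len = sizeof(linfo));

	link = bpf_program__attach_iter(prog, &opts);
	if (!link)
		return -errno;	/* libbpf 1.0: NULL + errno on failure */

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0) {
		err = iter_fd;
		goto out;
	}

	/* Each read() triggers another batch of seq_file output from the
	 * in-kernel iterator until it runs out of tasks. */
	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, len, stdout);

	close(iter_fd);
out:
	bpf_link__destroy(link);
	return err;
}

Per the attach logic above, setting .tid instead would visit a single thread, .pid visits every thread of the thread group, and .pid_fd resolves a pidfd to its tgid before behaving like .pid; leaving all three zero keeps the previous walk over all tasks in the namespace.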
@@ -371,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -400,10 +544,10 @@ again: switch (op) { case task_vma_iter_first_vma: - curr_vma = curr_task->mm->mmap; + curr_vma = find_vma(curr_task->mm, 0); break; case task_vma_iter_next_vma: - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma @@ -417,7 +561,7 @@ again: if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; } if (!curr_vma) { @@ -430,9 +574,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -531,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), }; +static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) +{ + switch (aux->task.type) { + case BPF_TASK_ITER_TID: + info->iter.task.tid = aux->task.pid; + break; + case BPF_TASK_ITER_TGID: + info->iter.task.pid = aux->task.pid; + break; + default: + break; + } + return 0; +} + +static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) +{ + seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); + if (aux->task.type == BPF_TASK_ITER_TID) + seq_printf(seq, "tid:\t%u\n", aux->task.pid); + else if (aux->task.type == BPF_TASK_ITER_TGID) + seq_printf(seq, "pid:\t%u\n", aux->task.pid); +} + static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -540,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -551,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -560,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_file_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -571,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -580,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_vma_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + 
.show_fdinfo = bpf_iter_task_show_fdinfo, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ff87e38af8a7..11f5ec0b8016 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void *bpf_jit_alloc_exec_page(void) -{ - void *image; - - image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!image) - return NULL; - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * every time new program is attached or detached. - */ - set_memory_x((long)image, 1); - return image; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; @@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec_page(); + im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; + set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -483,6 +468,8 @@ again: if (err < 0) goto out; + set_memory_rox((long)im->image, 1); + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) @@ -863,17 +850,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void) return start; } -static void notrace inc_misses_counter(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - unsigned int flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->misses); - u64_stats_update_end_irqrestore(&stats->syncp, flags); -} - /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into @@ -887,7 +863,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. 
*/ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); @@ -895,8 +871,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); @@ -924,19 +900,20 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP @@ -950,8 +927,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, return NO_START_TIME; } -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -960,14 +937,15 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); return 0; } @@ -976,17 +954,62 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + this_cpu_dec(*(prog->active)); + migrate_enable(); + rcu_read_unlock_trace(); +} + +static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) +{ + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } +static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, + struct bpf_tramp_run_ctx 
*run_ctx) + __acquires(RCU) +{ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock(); +} + void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); @@ -997,6 +1020,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) percpu_ref_put(&tr->pcref); } +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_enter_sleepable_recur : + __bpf_prog_enter_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_enter_lsm_cgroup; + + return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; +} + +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_exit_sleepable_recur : + __bpf_prog_exit_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_exit_lsm_cgroup; + + return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; +} + int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3eadb14e090b..a5255a0dcbb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,7 @@ #include <linux/error-injection.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> +#include <linux/poison.h> #include "disasm.h" @@ -261,7 +262,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; - struct bpf_map_value_off_desc *kptr_off_desc; + struct btf_field *kptr_field; u8 uninit_dynptr_regno; }; @@ -370,6 +371,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, bpf_verifier_vlog(log, fmt, args); va_end(args); } +EXPORT_SYMBOL_GPL(bpf_log); static const char *ltrim(const char *s) { @@ -427,6 +429,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, static bool type_is_pkt_pointer(enum bpf_reg_type type) { + type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } @@ -448,28 +451,34 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +static bool type_is_ptr_alloc_obj(u32 type) { - return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } -static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { - return base_type(type) == PTR_TO_SOCKET || - base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM || - base_type(type) == PTR_TO_BTF_ID; + struct btf_record *rec = NULL; + struct btf_struct_meta *meta; + + if (reg->type == PTR_TO_MAP_VALUE) { + rec = reg->map_ptr->record; + } else if (type_is_ptr_alloc_obj(reg->type)) { + meta = btf_find_struct_meta(reg->btf, reg->btf_id); + if 
(meta) + rec = meta->record; + } + return rec; } -static bool type_is_rdonly_mem(u32 type) +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return type & MEM_RDONLY; + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); } -static bool arg_type_may_be_refcounted(enum bpf_arg_type type) +static bool type_is_rdonly_mem(u32 type) { - return type == ARG_PTR_TO_SOCK_COMMON; + return type & MEM_RDONLY; } static bool type_may_be_null(u32 type) @@ -477,15 +486,6 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -static bool may_be_acquire_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem || - func_id == BPF_FUNC_ringbuf_reserve; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -518,6 +518,43 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } +static bool is_dynptr_ref_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_dynptr_data; +} + +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_for_each_map_elem || + func_id == BPF_FUNC_timer_set_callback || + func_id == BPF_FUNC_find_vma || + func_id == BPF_FUNC_loop || + func_id == BPF_FUNC_user_ringbuf_drain; +} + +static bool is_storage_get_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_storage_get || + func_id == BPF_FUNC_inode_storage_get || + func_id == BPF_FUNC_task_storage_get || + func_id == BPF_FUNC_cgrp_storage_get; +} + +static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + int ref_obj_uses = 0; + + if (is_ptr_cast_function(func_id)) + ref_obj_uses++; + if (is_acquire_function(func_id, map)) + ref_obj_uses++; + if (is_dynptr_ref_function(func_id)) + ref_obj_uses++; + + return ref_obj_uses > 1; +} + static bool is_cmpxchg_insn(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_STX && @@ -533,7 +570,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[32] = {0}; + char postfix[16] = {0}, prefix[64] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "scalar", @@ -555,6 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -564,16 +602,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 32); - if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 32); - if (type & MEM_USER) - strncpy(prefix, "user_", 32); - if (type & MEM_PERCPU) - strncpy(prefix, "percpu_", 32); - if (type & PTR_UNTRUSTED) - strncpy(prefix, "untrusted_", 32); + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? 
"trusted_" : "" + ); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -688,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -709,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -719,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -742,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -773,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if 
(!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -790,11 +850,27 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re return false; } + return true; +} + +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type dynptr_type; + int spi; + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; - return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); + dynptr_type = arg_to_dynptr_type(arg_type); + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -853,7 +929,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); @@ -986,9 +1062,9 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (ksize(dst) < bytes) { + if (ksize(dst) < ksize(src)) { kfree(dst); - dst = kmalloc_track_caller(bytes, flags); + dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags); if (!dst) return NULL; } @@ -1005,12 +1081,19 @@ out: */ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) { + size_t alloc_size; + void *new_arr; + if (!new_n || old_n == new_n) goto out; - arr = krealloc_array(arr, new_n, size, GFP_KERNEL); - if (!arr) + alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + if (!new_arr) { + kfree(arr); return NULL; + } + arr = new_arr; if (new_n > old_n) memset(arr + old_n * size, 0, (new_n - old_n) * size); @@ -1086,6 +1169,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) id = ++env->id_gen; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].callback_ref = state->in_callback_fn ? 
state->frameno : 0; return id; } @@ -1098,6 +1182,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { + /* Cannot release caller references in callbacks */ + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -1173,8 +1260,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->speculative = src->speculative; + dst_state->active_rcu_lock = src->active_rcu_lock; dst_state->curframe = src->curframe; - dst_state->active_spin_lock = src->active_spin_lock; + dst_state->active_lock.ptr = src->active_lock.ptr; + dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -1293,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1358,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -1369,7 +1468,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. 
*/ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -1658,7 +1757,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; + reg->precise = !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1739,6 +1838,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; + state->callback_ret_range = tnum_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -2466,15 +2566,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { u32 cnt = cur->jmp_history_cnt; struct bpf_idx_pair *p; + size_t alloc_size; + + if (!is_jmp_point(env, env->insn_idx)) + return 0; cnt++; - p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_USER); if (!p) return -ENOMEM; p[cnt - 1].idx = env->insn_idx; @@ -2626,6 +2741,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) return -ENOTSUPP; + /* BPF helpers that invoke callback subprogs are + * equivalent to BPF_PSEUDO_CALL above + */ + if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + return -ENOTSUPP; /* regular helper call sets R0 */ *reg_mask &= ~1; if (*reg_mask & 0x3f) { @@ -2715,8 +2835,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. + * We also skip current state and go straight to first parent state, + * because precision markings in current non-checkpointed state are + * not needed. See why in the comment in __mark_chain_precision below. 
*/ - for (; st; st = st->parent) + for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { @@ -2734,9 +2857,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, reg->precise = true; } } + } } -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + } +} + +/* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insns() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. It's ephemeral, and can end up either a) being discarded if + * compatible explored state is found at some point or BPF_EXIT instruction is + * reached or b) checkpointed and put into env->explored_states, branching out + * into one or more children states. + * + * In the former case, precise markings in current state are completely + * ignored by state comparison code (see regsafe() for details). Only + * checkpointed ("old") state precise markings are important, and if old + * state's register/slot is precise, regsafe() assumes current state's + * register/slot as precise and checks value ranges exactly and precisely. If + * states turn out to be compatible, current state's necessary precise + * markings and any required parent states' precise markings are enforced + * after the fact with propagate_precision() logic, after the fact. But it's + * important to realize that in this case, even after marking current state + * registers/slots as precise, we immediately discard current state. So what + * actually matters is any of the precise markings propagated into current + * state's parent states, which are always checkpointed (due to b) case above). + * As such, for scenario a) it doesn't matter if current state has precise + * markings set or not. + * + * Now, for the scenario b), checkpointing and forking into child(ren) + * state(s). 
Note that before current state gets to checkpointing step, any + * processed instruction always assumes precise SCALAR register/slot + * knowledge: if precise value or range is useful to prune jump branch, BPF + * verifier takes this opportunity enthusiastically. Similarly, when + * register's value is used to calculate offset or memory address, exact + * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to + * what we mentioned above about state comparison ignoring precise markings + * during state comparison, BPF verifier ignores and also assumes precise + * markings *at will* during instruction verification process. But as verifier + * assumes precision, it also propagates any precision dependencies across + * parent states, which are not yet finalized, so can be further restricted + * based on new knowledge gained from restrictions enforced by their children + * states. This is so that once those parent states are finalized, i.e., when + * they have no more active children state, state comparison logic in + * is_state_visited() would enforce strict and precise SCALAR ranges, if + * required for correctness. + * + * To build a bit more intuition, note also that once a state is checkpointed, + * the path we took to get to that state is not important. This is crucial + * property for state pruning. When state is checkpointed and finalized at + * some instruction index, it can be correctly and safely used to "short + * circuit" any *compatible* state that reaches exactly the same instruction + * index. I.e., if we jumped to that instruction from a completely different + * code path than original finalized state was derived from, it doesn't + * matter, current state can be discarded because from that instruction + * forward having a compatible state will ensure we will safely reach the + * exit. States describe preconditions for further exploration, but completely + * forget the history of how we got here. + * + * This also means that even if we needed precise SCALAR range to get to + * finalized state, but from that point forward *that same* SCALAR register is + * never used in a precise context (i.e., it's precise value is not needed for + * correctness), it's correct and safe to mark such register as "imprecise" + * (i.e., precise marking set to false). This is what we rely on when we do + * not set precise marking in current state. If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. 
+ */ +static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { struct bpf_verifier_state *st = env->cur_state; @@ -2753,18 +2989,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, if (!env->bpf_capable) return 0; - func = st->frame[st->curframe]; + /* Do sanity checks against current state of register and/or stack + * slot, but don't set precise flag in current state, as precision + * tracking in the current state is unnecessary. + */ + func = st->frame[frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - if (!reg->precise) - new_marks = true; - else - reg_mask = 0; - reg->precise = true; + new_marks = true; } while (spi >= 0) { @@ -2777,11 +3013,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, stack_mask = 0; break; } - if (!reg->precise) - new_marks = true; - else - stack_mask = 0; - reg->precise = true; + new_marks = true; break; } @@ -2789,12 +3021,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; if (!reg_mask && !stack_mask) return 0; + for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + + if (last_idx < 0) { + /* we are at the entry into subprog, which + * is expected for global funcs, but only if + * requested precise registers are R1-R5 + * (which are global func's input arguments) + */ + if (st->curframe == 0 && + st->frame[0]->subprogno > 0 && + st->frame[0]->callsite == BPF_MAIN_FUNC && + stack_mask == 0 && (reg_mask & ~0x3e) == 0) { + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &st->frame[0]->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + reg->precise = true; + } + return 0; + } + + verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, reg_mask, stack_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + for (i = last_idx;;) { if (skip_first) { err = 0; @@ -2834,7 +3096,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, break; new_marks = false; - func = st->frame[st->curframe]; + func = st->frame[frame]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; @@ -2898,14 +3160,19 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno, -1); + return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); } -static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) { - return __mark_chain_precision(env, -1, spi); + return __mark_chain_precision(env, frame, regno, -1); +} + +static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +{ + return __mark_chain_precision(env, frame, -1, spi); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -3154,14 +3421,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; mark_stack_slot_scratched(env, spi); - if 
(!env->allow_ptr_leaks - && *stype != NOT_INIT - && *stype != SCALAR_VALUE) { - /* Reject the write if there's are spilled pointers in - * range. If we didn't reject here, the ptr status - * would be erased below (even though not all slots are - * actually overwritten), possibly opening the door to - * leaks. + if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { + /* Reject the write if range we may write to has not + * been initialized beforehand. If we didn't reject + * here, the ptr status would be erased below (even + * though not all slots are actually overwritten), + * possibly opening the door to leaks. + * + * We do however catch STACK_INVALID case below, and + * only allow reading possibly uninitialized memory + * later for CAP_PERFMON, as the write may not happen to + * that slot. */ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", insn_idx, i); @@ -3651,15 +3921,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, } static int map_kptr_match_type(struct bpf_verifier_env *env, - struct bpf_map_value_off_desc *off_desc, + struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL; + const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3706,15 +3976,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id, - off_desc->type == BPF_KPTR_REF)) + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else @@ -3724,7 +3994,7 @@ bad_type: static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, - struct bpf_map_value_off_desc *off_desc) + struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); @@ -3734,7 +4004,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) - * - off_desc->offset == off + reg->var_off.value + * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3745,7 +4015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. 
*/ - if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3755,19 +4025,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && - map_kptr_match_type(env, off_desc, val_reg, value_regno)) + map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", - off_desc->offset); + kptr_field->offset); return -EACCES; } } else { @@ -3786,45 +4056,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; - int err; + struct btf_record *rec; + int err, i; err = check_mem_region_access(env, regno, off, size, map->value_size, zero_size_allowed); if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; + if (IS_ERR_OR_NULL(map->record)) + return 0; + rec = map->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 p = field->offset; - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. 
*/ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_kptrs(map)) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - int i; - - for (i = 0; i < tab->nr_off; i++) { - u32 p = tab->off[i].offset; - - if (reg->smin_value + off < p + sizeof(u64) && - p < reg->umax_value + off + size) { + if (reg->smin_value + off < p + btf_field_type_size(field->type) && + p < reg->umax_value + off + size) { + switch (field->type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -3843,10 +4098,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } break; + default: + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); + return -EACCES; } } } - return err; + return 0; } #define MAX_PACKET_OFF 0xffff @@ -4063,6 +4322,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ + /* A referenced register is always trusted. */ + if (reg->ref_obj_id) + return true; + + /* If a register is not referenced, it is trusted if it has the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the + * other type modifiers may be safe, but we elect to take an opt-in + * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are + * not. + * + * Eventually, we should make PTR_TRUSTED the single source of truth + * for whether a register is trusted. 
+ */ + return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && + !bpf_type_has_unsafe_modifiers(reg->type); +} + +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ + return reg->type & MEM_RCU; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4479,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; + if (!env->allow_ptr_leaks) { + verbose(env, + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { + verbose(env, + "Cannot access kernel 'struct %s' from non-GPL compatible program\n", + tname); + return -EINVAL; + } if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -4509,17 +4804,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id, &flag); + if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (!btf_is_kernel(reg->btf)) { + verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + return -EFAULT; + } + ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } else { - if (atype != BPF_READ) { + /* Writes are permitted with default btf_struct_access for + * program allocated objects (which always have ref_obj_id > 0), + * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. + */ + if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id, &flag); + if (type_is_alloc(reg->type) && !reg->ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + return -EFAULT; + } + + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } if (ret < 0) @@ -4531,6 +4837,30 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; + /* By default any pointer obtained from walking a trusted pointer is + * no longer trusted except the rcu case below. + */ + flag &= ~PTR_TRUSTED; + + if (flag & MEM_RCU) { + /* Mark value register as MEM_RCU only if it is protected by + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU + * itself can already indicate trustedness inside the rcu + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since + * it could be null in some cases. + */ + if (!env->cur_state->active_rcu_lock || + !(is_trusted_reg(reg) || is_rcu_reg(reg))) + flag &= ~MEM_RCU; + else + flag |= PTR_MAYBE_NULL; + } else if (reg->type & MEM_RCU) { + /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged + * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. 
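For context (not part of the patch): a sketch of the program pattern the MEM_RCU handling above is aimed at. It assumes the bpf_rcu_read_lock()/bpf_rcu_read_unlock() kfuncs introduced in the same series and a task_struct field (real_parent) that is __rcu-tagged in the kernel BTF; the attach point is illustrative.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    void bpf_rcu_read_lock(void) __ksym;
    void bpf_rcu_read_unlock(void) __ksym;

    SEC("fentry/do_nanosleep")
    int peek_parent(void *ctx)
    {
            struct task_struct *task = bpf_get_current_task_btf();
            struct task_struct *parent;
            int ppid = 0;

            /* Inside the RCU read-side section, walking an __rcu-tagged field of
             * a trusted pointer yields MEM_RCU | PTR_MAYBE_NULL rather than an
             * untrusted pointer, so it may be dereferenced after a NULL check. */
            bpf_rcu_read_lock();
            parent = task->real_parent;
            if (parent)
                    ppid = parent->pid;
            bpf_rcu_read_unlock();

            bpf_printk("parent pid %d", ppid);
            return 0;
    }

    char _license[] SEC("license") = "GPL";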
+ */ + flag |= PTR_UNTRUSTED; + } + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -4545,6 +4875,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; @@ -4565,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!env->allow_ptr_to_map_access) { + if (!env->allow_ptr_leaks) { verbose(env, - "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } @@ -4583,7 +4914,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); + /* Simulate access to a PTR_TO_BTF_ID */ + memset(&map_reg, 0, sizeof(map_reg)); + mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; @@ -4719,7 +5053,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { - struct bpf_map_value_off_desc *kptr_off_desc = NULL; + struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -4733,11 +5067,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; if (tnum_is_const(reg->var_off)) - kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, - off + reg->var_off.value); - if (kptr_off_desc) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, - kptr_off_desc); + kptr_field = btf_record_find(reg->map_ptr->record, + off + reg->var_off.value, BPF_KPTR); + if (kptr_field) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -5128,10 +5461,6 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) - goto mark; - - if (is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { @@ -5161,6 +5490,11 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + * be sure that whether stack slot is written to or not. Hence, + * we must still conservatively propagate reads upwards even if + * helper may write to the entire memory range. + */ } return update_stack_depth(env, state, min_off); } @@ -5223,6 +5557,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_CTX: + /* in case the function doesn't know how to access the context, + * (because we are in a program of type SYSCALL for example), we + * can not statically check its size. + * Dynamically check it now. 
+ */ + if (!env->ops->convert_ctx_access) { + enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; + int offset = access_size - 1; + + /* Allow zero-byte read from PTR_TO_CTX */ + if (access_size == 0) + return zero_size_allowed ? 0 : -EACCES; + + return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, + atype, -1, false); + } + + fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -5323,8 +5676,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return err; } -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) { struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); @@ -5352,23 +5705,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state } /* Implementation details: - * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. + * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. * Two bpf_map_lookups (even with the same key) will have different reg->id. - * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after - * value_or_null->value transition, since the verifier only cares about - * the range of access to valid map value pointer and doesn't care about actual - * address of the map element. + * Two separate bpf_obj_new will also have different reg->id. + * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier + * clears reg->id after value_or_null->value transition, since the verifier only + * cares about the range of access to valid map value pointer and doesn't care + * about actual address of the map element. * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps * reg->id > 0 after value_or_null->value transition. By doing so * two bpf_map_lookups will be considered two different pointers that - * point to different bpf_spin_locks. + * point to different bpf_spin_locks. Likewise for pointers to allocated objects + * returned from bpf_obj_new. * The verifier allows taking only one bpf_spin_lock at a time to avoid * dead-locks. * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_state->active_spin_lock remembers which map value element got locked - * and clears it after bpf_spin_unlock. + * cur_state->active_lock remembers which map value element or allocated + * object got locked and clears it after bpf_spin_unlock. 
*/ static int process_spin_lock(struct bpf_verifier_env *env, int regno, bool is_lock) @@ -5376,8 +5732,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); - struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + struct bpf_map *map = NULL; + struct btf *btf = NULL; + struct btf_record *rec; if (!is_const) { verbose(env, @@ -5385,49 +5743,78 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, regno); return -EINVAL; } - if (!map->btf) { - verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); - return -EINVAL; - } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) + if (reg->type == PTR_TO_MAP_VALUE) { + map = reg->map_ptr; + if (!map->btf) { verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + "map '%s' has to have BTF in order to use bpf_spin_lock\n", map->name); + return -EINVAL; + } + } else { + btf = reg->btf; + } + + rec = reg_btf_record(reg); + if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", + map ? map->name : "kptr"); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (rec->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, rec->spin_lock_off); return -EINVAL; } if (is_lock) { - if (cur->active_spin_lock) { + if (cur->active_lock.ptr) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } - cur->active_spin_lock = reg->id; + if (map) + cur->active_lock.ptr = map; + else + cur->active_lock.ptr = btf; + cur->active_lock.id = reg->id; } else { - if (!cur->active_spin_lock) { + struct bpf_func_state *fstate = cur_func(env); + void *ptr; + int i; + + if (map) + ptr = map; + else + ptr = btf; + + if (!cur->active_lock.ptr) { verbose(env, "bpf_spin_unlock without taking a lock\n"); return -EINVAL; } - if (cur->active_spin_lock != reg->id) { + if (cur->active_lock.ptr != ptr || + cur->active_lock.id != reg->id) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } - cur->active_spin_lock = 0; + cur->active_lock.ptr = NULL; + cur->active_lock.id = 0; + + for (i = fstate->acquired_refs - 1; i >= 0; i--) { + int err; + + /* Complain on error because this reference state cannot + * be freed before this point, as bpf_spin_lock critical + * section does not allow functions that release the + * allocated object immediately. 
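Not part of the patch: a minimal sketch of the bpf_spin_lock usage that process_spin_lock() above now tracks through cur_state->active_lock; map, field and section names are made up.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct map_value {
            struct bpf_spin_lock lock;
            long counter;
    };

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __uint(max_entries, 1);
            __type(key, int);
            __type(value, struct map_value);
    } lock_map SEC(".maps");

    SEC("tc")
    int bump_counter(struct __sk_buff *skb)
    {
            struct map_value *v;
            int key = 0;

            v = bpf_map_lookup_elem(&lock_map, &key);
            if (!v)
                    return 0;
            /* Only one lock may be held at a time; the verifier records the
             * lock owner on bpf_spin_lock() and demands the matching unlock. */
            bpf_spin_lock(&v->lock);
            v->counter++;
            bpf_spin_unlock(&v->lock);
            return 0;
    }

    char _license[] SEC("license") = "GPL";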
+ */ + if (!fstate->refs[i].release_on_unlock) + continue; + err = release_reference(env, fstate->refs[i].id); + if (err) { + verbose(env, "failed to release release_on_unlock reference"); + return err; + } + } } return 0; } @@ -5451,24 +5838,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -5484,10 +5860,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map_value_off_desc *off_desc; struct bpf_map *map_ptr = reg->map_ptr; + struct btf_field *kptr_field; u32 kptr_off; - int ret; if (!tnum_is_const(reg->var_off)) { verbose(env, @@ -5500,30 +5875,136 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, map_ptr->name); return -EINVAL; } - if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); - if (ret == -E2BIG) - verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, - BPF_MAP_VALUE_OFF_MAX); - else if (ret == -EEXIST) - verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); - else - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); return -EINVAL; } meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); - if (!off_desc) { + kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (off_desc->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } - meta->kptr_off_desc = off_desc; + meta->kptr_field = kptr_field; + return 0; +} + +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. 
+ * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * alignment for PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); + return -EINVAL; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. + */ + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. 
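Illustrative only (not from the patch): the stack-dynptr pattern the two-level mutability comment above describes, using the existing ringbuf dynptr helpers. The MEM_UNINIT argument constructs the dynptr, bpf_dynptr_data() then mutates only the memory it points to, and the submit helper consumes it; names, attach point and sizes are arbitrary.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 4096);
    } rb SEC(".maps");

    SEC("tp_btf/sched_process_exec")
    int log_exec(void *ctx)
    {
            struct bpf_dynptr ptr;  /* on the BPF stack: a PTR_TO_STACK dynptr */
            u32 *slot;

            /* MEM_UNINIT argument: initializes the dynptr (or leaves it null on
             * failure; submit/discard must still be called either way). */
            bpf_ringbuf_reserve_dynptr(&rb, sizeof(u32), 0, &ptr);

            /* Mutates the memory the dynptr points to, not the dynptr itself. */
            slot = bpf_dynptr_data(&ptr, 0, sizeof(u32));
            if (slot)
                    *slot = bpf_get_current_pid_tgid() >> 32;

            /* Consumes (destroys) the dynptr. */
            bpf_ringbuf_submit_dynptr(&ptr, 0);
            return 0;
    }

    char _license[] SEC("license") = "GPL";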
+ */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = "<unknown>"; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + } return 0; } @@ -5588,16 +6069,6 @@ struct bpf_reg_types { u32 *btf_id; }; -static const struct bpf_reg_types map_key_value_types = { - .types = { - PTR_TO_STACK, - PTR_TO_PACKET, - PTR_TO_PACKET_META, - PTR_TO_MAP_KEY, - PTR_TO_MAP_VALUE, - }, -}; - static const struct bpf_reg_types sock_types = { .types = { PTR_TO_SOCK_COMMON, @@ -5615,6 +6086,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; @@ -5628,7 +6100,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_MEM | MEM_ALLOC, + PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, }, }; @@ -5643,23 +6115,46 @@ static const struct bpf_reg_types int_ptr_types = { }, }; +static const struct bpf_reg_types spin_lock_types = { + .types = { + PTR_TO_MAP_VALUE, + PTR_TO_BTF_ID | MEM_ALLOC, + } +}; + static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; +static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; -static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; +static const struct bpf_reg_types btf_ptr_types = { + .types = { + PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, + PTR_TO_BTF_ID | MEM_RCU, + }, +}; +static const struct bpf_reg_types percpu_btf_ptr_types = { + .types = { + PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, + } +}; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct 
bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + CONST_PTR_TO_DYNPTR, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { - [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_KEY] = &mem_types, + [ARG_PTR_TO_MAP_VALUE] = &mem_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5673,7 +6168,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -5682,7 +6177,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5732,7 +6227,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. @@ -5749,15 +6244,29 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; - } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, - strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); - return -EACCES; + } else { + if (arg_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", + regno); + return -EACCES; + } + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); + return -EACCES; + } + } + } else if (type_is_alloc(reg->type)) { + if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { + verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + return -EFAULT; } } @@ -5768,64 +6277,80 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { - enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; + u32 type = reg->type; - switch ((u32)type) { - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_STACK: - if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset\n"); + /* When referenced register is passed to release function, its fixed + * offset must be 
0. + * + * We will check arg_type_is_release reg has ref_obj_id when storing + * meta->release_regno. + */ + if (arg_type_is_release(arg_type)) { + /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it + * may not directly point to the object being released, but to + * dynptr pointing to such object, which might be at some offset + * on the stack. In that case, we simply to fallback to the + * default handling. + */ + if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) + return 0; + /* Doing check_ptr_off_reg check for the offset will catch this + * because fixed_off_ok is false, but checking here allows us + * to give the user a better error message. + */ + if (reg->off) { + verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", + regno); return -EINVAL; } - fallthrough; + return __check_ptr_off_reg(env, reg, regno, false); + } + + switch (type) { + /* Pointer types where both fixed and variable offset is explicitly allowed: */ + case PTR_TO_STACK: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) - return 0; - break; + return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. - */ - if (arg_type_is_release(arg_type) && reg->off) { - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - return -EINVAL; - } - /* For arg is release pointer, fixed_off_ok must be false, but - * we already checked and rejected reg->off != 0 above, so set - * to true to allow fixed offset for all other cases. + * its fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. This was already checked above. So pass + * fixed_off_ok as true to allow fixed offset for all other + * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we + * still need to do checks instead of returning. */ - fixed_off_ok = true; - break; + return __check_ptr_off_reg(env, reg, regno, true); default: - break; + return __check_ptr_off_reg(env, reg, regno, false); } - return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -5874,7 +6399,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto skip_type_check; /* arg_btf_id and arg_size are in a union. 
*/ - if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID || + base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); @@ -5889,11 +6415,22 @@ skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -5990,19 +6527,22 @@ skip_type_check: break; case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; + err = process_spin_lock(env, regno, true); + if (err) + return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; + err = process_spin_lock(env, regno, false); + if (err) + return err; } else { verbose(env, "verifier internal error\n"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - if (process_timer_func(env, regno, meta)) - return -EACCES; + err = process_timer_func(env, regno, meta); + if (err) + return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; @@ -6025,39 +6565,9 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. 
- */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; - } - - meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local "; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf "; - break; - default: - break; - } - - verbose(env, "Expected an initialized %sdynptr as arg #%d\n", - err_extra, arg + 1); - return -EINVAL; - } + err = process_dynptr_func(env, regno, arg_type, meta); + if (err) + return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -6124,8 +6634,9 @@ skip_type_check: break; } case ARG_PTR_TO_KPTR: - if (process_kptr_func(env, regno, meta)) - return -EACCES; + err = process_kptr_func(env, regno, meta); + if (err) + return err; break; } @@ -6199,6 +6710,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6282,6 +6797,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_task_storage_delete) goto error; break; + case BPF_MAP_TYPE_CGRP_STORAGE: + if (func_id != BPF_FUNC_cgrp_storage_get && + func_id != BPF_FUNC_cgrp_storage_delete) + goto error; + break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_push_elem) @@ -6318,6 +6838,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6390,6 +6914,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; + case BPF_FUNC_cgrp_storage_get: + case BPF_FUNC_cgrp_storage_delete: + if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) + goto error; + break; default: break; } @@ -6456,41 +6985,15 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) -{ - int count = 0; - - if (arg_type_may_be_refcounted(fn->arg1_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg2_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg3_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg4_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg5_type)) - count++; - - /* A reference acquiring function cannot acquire - * another refcounted ptr. - */ - if (may_be_acquire_function(func_id) && count) - return false; - - /* We only support one arg being unreferenced at the moment, - * which is sufficient for the helper functions we have right now. 
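Not part of the patch: a sketch of the new map/helper pairing enforced in check_map_func_compatibility() above for BPF_MAP_TYPE_CGRP_STORAGE, assuming the bpf_cgrp_storage_get() helper from the same cycle; the attach point and names are illustrative.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    struct {
            __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, long);
    } cgrp_counters SEC(".maps");

    SEC("tp_btf/cgroup_mkdir")
    int BPF_PROG(count_mkdir, struct cgroup *cgrp, const char *path)
    {
            long *val;

            /* Only the cgrp_storage helpers may operate on this map type,
             * and vice versa. */
            val = bpf_cgrp_storage_get(&cgrp_counters, cgrp, 0,
                                       BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (val)
                    __sync_fetch_and_add(val, 1);
            return 0;
    }

    char _license[] SEC("license") = "GPL";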
- */ - return count <= 1; -} - static bool check_btf_id_ok(const struct bpf_func_proto *fn) { int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) - return false; - + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID) + return !!fn->arg_btf_id[i]; + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK) + return fn->arg_btf_id[i] == BPF_PTR_POISON; if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && /* arg_btf_id and arg_size are in a union. */ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || @@ -6501,43 +7004,25 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id, - struct bpf_call_arg_meta *meta) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_btf_id_ok(fn) && - check_refcount_ok(fn, func_id) ? 0 : -EINVAL; + check_btf_id_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(env, regs, i); + struct bpf_func_state *state; + struct bpf_reg_state *reg; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(env, reg); - } -} - -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state *vstate = env->cur_state; - int i; - - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); + })); } enum { @@ -6566,41 +7051,28 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int ref_obj_id) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].ref_obj_id == ref_obj_id) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); - } -} - /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. 
*/ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { - struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state; + struct bpf_reg_state *reg; int err; - int i; err = release_reference_state(cur_func(env), ref_obj_id); if (err) return err; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], ref_obj_id); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, reg); + else + __mark_reg_unknown(env, reg); + } + })); return 0; } @@ -6622,6 +7094,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -6648,7 +7124,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_subprog_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { @@ -6672,6 +7148,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + /* set_callee_state is used for direct subprog calls, but we are + * interested in validating only BPF helpers that can call subprogs as + * callbacks + */ + if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { + verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + if (insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { @@ -6716,11 +7202,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* Transfer references to the callee */ err = copy_reference_state(callee, caller); if (err) - return err; + goto err_out; err = set_callee_state_cb(env, caller, callee, *insn_idx); if (err) - return err; + goto err_out; clear_caller_saved_regs(env, caller->regs); @@ -6737,6 +7223,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn print_verifier_state(env, callee, true); } return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, @@ -6822,6 +7313,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6843,6 +7335,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6872,6 +7365,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6899,6 +7393,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env 
*env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); + return 0; +} + +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6922,11 +7440,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } - state->curframe--; - caller = state->frame[state->curframe]; + caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. */ - struct tnum range = tnum_range(0, 1); + struct tnum range = callee->callback_ret_range; if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); @@ -6941,10 +7458,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; + /* callback_fn frame should have released its own additions to parent's + * reference state at this point, or check_reference_leak would + * complain, hence it must be the same as the caller. There is no need + * to copy it back. + */ + if (!callee->in_callback_fn) { + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; + } *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { @@ -6955,7 +7479,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* clear everything in the callee */ free_func_state(callee); - state->frame[state->curframe + 1] = NULL; + state->frame[state->curframe--] = NULL; return 0; } @@ -7066,13 +7590,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); + bool refs_lingering = false; int i; + if (state->frameno && !state->in_callback_fn) + return 0; + for (i = 0; i < state->acquired_refs; i++) { + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); + refs_lingering = true; } - return state->acquired_refs ? -EINVAL : 0; + return refs_lingering ? -EINVAL : 0; } static int check_bpf_snprintf_call(struct bpf_verifier_env *env, @@ -7208,6 +7739,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (!env->prog->aux->sleepable && fn->might_sleep) { + verbose(env, "helper call might sleep in a non-sleepable prog\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. 
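For context (not part of the patch): a sketch of the bpf_user_ringbuf_drain() usage whose callback state set_user_ringbuf_callback_state() above wires up; the callback receives a read-only local dynptr per sample and its return value is constrained by callback_ret_range. Map name, sample layout and attach point are made up.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
            __uint(max_entries, 4096);
    } user_rb SEC(".maps");

    static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
    {
            long *sum = ctx;
            u32 val;

            /* Copy one user-submitted sample out of the dynptr. */
            if (bpf_dynptr_read(&val, sizeof(val), dynptr, 0, 0))
                    return 0;
            *sum += val;
            return 0;  /* 0 = keep draining, 1 = stop */
    }

    SEC("fentry/do_unlinkat")
    int drain_user_rb(void *ctx)
    {
            long sum = 0;

            bpf_user_ringbuf_drain(&user_rb, handle_sample, &sum, 0);
            bpf_printk("drained, sum=%ld", sum);
            return 0;
    }

    char _license[] SEC("license") = "GPL";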
*/ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -7219,13 +7755,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id, &meta); + err = check_func_proto(fn, func_id); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } + if (env->cur_state->active_rcu_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -7254,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7272,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. 
+ */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7344,6 +7908,29 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } break; + case BPF_FUNC_dynptr_data: + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + + if (meta.ref_obj_id) { + verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + return -EFAULT; + } + + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); + break; + } + } + if (i == MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + return -EFAULT; + } + break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) @@ -7360,13 +7947,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; - ret_flag = type_flag(fn->ret_type); - if (ret_type == RET_INTEGER) { + ret_flag = type_flag(ret_type); + + switch (base_type(ret_type)) { + case RET_INTEGER: /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (ret_type == RET_VOID) { + break; + case RET_VOID: regs[BPF_REG_0].type = NOT_INIT; - } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { + break; + case RET_PTR_TO_MAP_VALUE: /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -7382,23 +7973,29 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } - } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { + break; + case RET_PTR_TO_SOCKET: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { + break; + case RET_PTR_TO_SOCK_COMMON: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { + break; + case RET_PTR_TO_TCP_SOCK: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { + break; + case RET_PTR_TO_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { + break; + case RET_PTR_TO_MEM_OR_BTF_ID: + { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -7430,16 +8027,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + break; + } + case RET_PTR_TO_BTF_ID: + { struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { - ret_btf = meta.kptr_off_desc->kptr.btf; - ret_btf_id = 
meta.kptr_off_desc->kptr.btf_id; + ret_btf = meta.kptr_field->kptr.btf; + ret_btf_id = meta.kptr_field->kptr.btf_id; } else { + if (fn->ret_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", + func_id_name(func_id)); + return -EINVAL; + } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; } @@ -7451,7 +8057,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - } else { + break; + } + default: verbose(env, "unknown return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; @@ -7460,7 +8068,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (is_ptr_cast_function(func_id)) { + if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", + func_id_name(func_id), func_id); + return -EFAULT; + } + + if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map_ptr)) { @@ -7472,21 +8086,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; - } else if (func_id == BPF_FUNC_dynptr_data) { - int dynptr_id = 0, i; - - /* Find the id of the dynptr we're acquiring a reference to */ - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (dynptr_id) { - verbose(env, "verifier internal error: multiple dynptr args in func\n"); - return -EFAULT; - } - dynptr_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); - } - } - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); @@ -7553,18 +8152,926 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u32 ret_btf_id; + u64 r0_size; + struct { + u64 value; + bool found; + } arg_constant; + struct { + struct btf *btf; + u32 btf_id; + } arg_obj_drop; + struct { + struct btf_field *field; + } arg_list_head; +}; + +static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ACQUIRE; +} + +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RET_NULL; +} + +static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RELEASE; +} + +static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_TRUSTED_ARGS; +} + +static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_SLEEPABLE; +} + +static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_DESTRUCTIVE; +} + +static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU; +} + +static bool is_kfunc_arg_kptr_get(struct 
bpf_kfunc_call_arg_meta *meta, int arg) +{ + return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); +} + +static bool __kfunc_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix) +{ + int suffix_len = strlen(suffix), len; + const char *param_name; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < suffix_len) + return false; + param_name += len - suffix_len; + return !strncmp(param_name, suffix, suffix_len); +} + +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + return __kfunc_param_match_suffix(btf, arg, "__sz"); +} + +static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__k"); +} + +static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__ign"); +} + +static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__alloc"); +} + +static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, + const struct btf_param *arg, + const char *name) +{ + int len, target_len = strlen(name); + const char *param_name; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + +enum { + KF_ARG_DYNPTR_ID, + KF_ARG_LIST_HEAD_ID, + KF_ARG_LIST_NODE_ID, +}; + +BTF_ID_LIST(kf_arg_btf_ids) +BTF_ID(struct, bpf_dynptr_kern) +BTF_ID(struct, bpf_list_head) +BTF_ID(struct, bpf_list_node) + +static bool __is_kfunc_ptr_arg_type(const struct btf *btf, + const struct btf_param *arg, int type) +{ + const struct btf_type *t; + u32 res_id; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t) + return false; + if (!btf_type_is_ptr(t)) + return false; + t = btf_type_skip_modifiers(btf, t->type, &res_id); + if (!t) + return false; + return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]); +} + +static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID); +} + +static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID); +} + +static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); +} + +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + verbose(env, "max struct nesting depth exceeded\n"); 
+ return false; + } + if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + + +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + +enum kfunc_ptr_arg_type { + KF_ARG_PTR_TO_CTX, + KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ + KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ + KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_LIST_HEAD, + KF_ARG_PTR_TO_LIST_NODE, + KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ + KF_ARG_PTR_TO_MEM, + KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ +}; + +enum special_kfunc_type { + KF_bpf_obj_new_impl, + KF_bpf_obj_drop_impl, + KF_bpf_list_push_front, + KF_bpf_list_push_back, + KF_bpf_list_pop_front, + KF_bpf_list_pop_back, + KF_bpf_cast_to_kern_ctx, + KF_bpf_rdonly_cast, + KF_bpf_rcu_read_lock, + KF_bpf_rcu_read_unlock, +}; + +BTF_SET_START(special_kfunc_set) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_SET_END(special_kfunc_set) + +BTF_ID_LIST(special_kfunc_list) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rcu_read_lock) +BTF_ID(func, bpf_rcu_read_unlock) + +static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; +} + +static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; +} + +static enum kfunc_ptr_arg_type +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, + const struct btf_type *t, const struct btf_type *ref_t, + const char *ref_tname, const struct btf_param *args, + int argno, int nargs) +{ + u32 regno = argno + 1; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[regno]; + bool arg_mem_size = false; + + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + return KF_ARG_PTR_TO_CTX; + + /* In this function, we verify the kfunc's BTF as per the argument type, + * leaving the rest of the verification with respect to the register + * type to our caller. When a set of conditions hold in the BTF type of + * arguments, we resolve it to a known kfunc_ptr_arg_type. 
+ */ + if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + return KF_ARG_PTR_TO_CTX; + + if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_ALLOC_BTF_ID; + + if (is_kfunc_arg_kptr_get(meta, argno)) { + if (!btf_type_is_ptr(ref_t)) { + verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); + return -EINVAL; + } + ref_t = btf_type_by_id(meta->btf, ref_t->type); + ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off); + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n", + meta->func_name, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_KPTR; + } + + if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_DYNPTR; + + if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_LIST_HEAD; + + if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_LIST_NODE; + + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_BTF_ID; + } + + if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) + arg_mem_size = true; + + /* This is the catch all argument type of register types supported by + * check_helper_mem_access. However, we only allow when argument type is + * pointer to scalar, or struct composed (recursively) of scalars. When + * arg_mem_size is true, the pointer can be void *. + */ + if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { + verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + return -EINVAL; + } + return arg_mem_size ? 
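/*
 * [Editor's illustration - not part of the patch text] The KF_ARG_PTR_TO_MEM
 * catch-all above accepts a pointer to a scalar or to a struct composed only
 * of scalars, as classified by __btf_type_is_scalar_struct(). A sketch of
 * what passes and what is rejected:
 *
 *      struct ok {                     scalars, arrays of scalars, and
 *              int a;                  structs of scalars nested at most
 *              short b[4];             four levels deep are accepted
 *              struct { u32 x; } c;
 *      };
 *      struct not_ok {
 *              int *p;                 a pointer member fails the check
 *      };
 *
 * If the following parameter carries the __sz suffix, a void pointer is also
 * allowed and the pair is classified as KF_ARG_PTR_TO_MEM_SIZE instead.
 */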
KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; +} + +static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, u32 ref_id, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + const struct btf_type *reg_ref_t; + bool strict_type_match = false; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (base_type(reg->type) == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; + } + + if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) + strict_type_match = true; + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); + if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + btf_type_str(reg_ref_t), reg_ref_tname); + return -EINVAL; + } + return 0; +} + +static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + struct btf_field *kptr_field; + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + verbose(env, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { + verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return 0; +} + +static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +{ + struct bpf_func_state *state = cur_func(env); + struct bpf_reg_state *reg; + int i; + + /* bpf_spin_lock only allows calling list_push and list_pop, no BPF + * subprogs, no global functions. This means that the references would + * not be released inside the critical section but they may be added to + * the reference state, and the acquired_refs are never copied out for a + * different frame as BPF to BPF calls don't work in bpf_spin_lock + * critical sections. 
+ */ + if (!ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); + return -EFAULT; + } + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].release_on_unlock) { + verbose(env, "verifier internal error: expected false release_on_unlock"); + return -EFAULT; + } + state->refs[i].release_on_unlock = true; + /* Now mark everyone sharing same ref_obj_id as untrusted */ + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + reg->type |= PTR_UNTRUSTED; + })); + return 0; + } + } + verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + return -EFAULT; +} + +/* Implementation details: + * + * Each register points to some region of memory, which we define as an + * allocation. Each allocation may embed a bpf_spin_lock which protects any + * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same + * allocation. The lock and the data it protects are colocated in the same + * memory region. + * + * Hence, everytime a register holds a pointer value pointing to such + * allocation, the verifier preserves a unique reg->id for it. + * + * The verifier remembers the lock 'ptr' and the lock 'id' whenever + * bpf_spin_lock is called. + * + * To enable this, lock state in the verifier captures two values: + * active_lock.ptr = Register's type specific pointer + * active_lock.id = A unique ID for each register pointer value + * + * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two + * supported register types. + * + * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of + * allocated objects is the reg->btf pointer. + * + * The active_lock.id is non-unique for maps supporting direct_value_addr, as we + * can establish the provenance of the map value statically for each distinct + * lookup into such maps. They always contain a single map value hence unique + * IDs for each pseudo load pessimizes the algorithm and rejects valid programs. + * + * So, in case of global variables, they use array maps with max_entries = 1, + * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point + * into the same map value as max_entries is 1, as described above). + * + * In case of inner map lookups, the inner map pointer has same map_ptr as the + * outer map pointer (in verifier context), but each lookup into an inner map + * assigns a fresh reg->id to the lookup, so while lookups into distinct inner + * maps from the same outer map share the same map_ptr as active_lock.ptr, they + * will get different reg->id assigned to each lookup, hence different + * active_lock.id. + * + * In case of allocated objects, active_lock.ptr is the reg->btf, and the + * reg->id is a unique ID preserved after the NULL pointer check on the pointer + * returned from bpf_obj_new. Each allocation receives a new reg->id. 
+ */ +static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + void *ptr; + u32 id; + + switch ((int)reg->type) { + case PTR_TO_MAP_VALUE: + ptr = reg->map_ptr; + break; + case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: + ptr = reg->btf; + break; + default: + verbose(env, "verifier internal error: unknown reg type for lock check\n"); + return -EFAULT; + } + id = reg->id; + + if (!env->cur_state->active_lock.ptr) + return -EINVAL; + if (env->cur_state->active_lock.ptr != ptr || + env->cur_state->active_lock.id != id) { + verbose(env, "held lock and object are not in the same allocation\n"); + return -EINVAL; + } + return 0; +} + +static bool is_bpf_list_api_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_list_push_front] || + btf_id == special_kfunc_list[KF_bpf_list_push_back] || + btf_id == special_kfunc_list[KF_bpf_list_pop_front] || + btf_id == special_kfunc_list[KF_bpf_list_pop_back]; +} + +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct btf_field *field; + struct btf_record *rec; + u32 list_head_off; + + if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { + verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); + return -EFAULT; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", + regno); + return -EINVAL; + } + + rec = reg_btf_record(reg); + list_head_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); + if (!field) { + verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); + return -EINVAL; + } + + /* All functions require bpf_list_head to be protected using a bpf_spin_lock */ + if (check_reg_allocation_locked(env, reg)) { + verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", + rec->spin_lock_off); + return -EINVAL; + } + + if (meta->arg_list_head.field) { + verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); + return -EFAULT; + } + meta->arg_list_head.field = field; + return 0; +} + +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + const struct btf_type *et, *t; + struct btf_field *field; + struct btf_record *rec; + u32 list_node_off; + + if (meta->btf != btf_vmlinux || + (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && + meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { + verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); + return -EFAULT; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. 
bpf_list_node has to be at the constant offset\n", + regno); + return -EINVAL; + } + + rec = reg_btf_record(reg); + list_node_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); + if (!field || field->offset != list_node_off) { + verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); + return -EINVAL; + } + + field = meta->arg_list_head.field; + + et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id); + t = btf_type_by_id(reg->btf, reg->btf_id); + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf, + field->list_head.value_btf_id, true)) { + verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " + "in struct %s, but arg is at offset=%d in struct %s\n", + field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off), + list_node_off, btf_name_by_offset(reg->btf, t->name_off)); + return -EINVAL; + } + + if (list_node_off != field->list_head.node_offset) { + verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", + list_node_off, field->list_head.node_offset, + btf_name_by_offset(field->list_head.btf, et->name_off)); + return -EINVAL; + } + /* Set arg#1 for expiration after unlock */ + return ref_set_release_on_unlock(env, reg->ref_obj_id); +} + +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +{ + const char *func_name = meta->func_name, *ref_tname; + const struct btf *btf = meta->btf; + const struct btf_param *args; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(meta->func_proto + 1); + nargs = btf_type_vlen(meta->func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "Function %s has %d > %d args\n", func_name, nargs, + MAX_BPF_FUNC_REG_ARGS); + return -EINVAL; + } + + /* Check that BTF function arguments match actual types that the + * verifier sees. 
+ */ + for (i = 0; i < nargs; i++) { + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; + const struct btf_type *t, *ref_t, *resolve_ret; + enum bpf_arg_type arg_type = ARG_DONTCARE; + u32 regno = i + 1, ref_id, type_size; + bool is_ret_buf_sz = false; + int kf_arg_type; + + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + + if (is_kfunc_arg_ignore(btf, &args[i])) + continue; + + if (btf_type_is_scalar(t)) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "R%d is not a scalar\n", regno); + return -EINVAL; + } + + if (is_kfunc_arg_constant(meta->btf, &args[i])) { + if (meta->arg_constant.found) { + verbose(env, "verifier internal error: only one constant argument permitted\n"); + return -EFAULT; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d must be a known constant\n", regno); + return -EINVAL; + } + ret = mark_chain_precision(env, regno); + if (ret < 0) + return ret; + meta->arg_constant.found = true; + meta->arg_constant.value = reg->var_off.value; + } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) { + meta->r0_rdonly = true; + is_ret_buf_sz = true; + } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) { + is_ret_buf_sz = true; + } + + if (is_ret_buf_sz) { + if (meta->r0_size) { + verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a const\n", regno); + return -EINVAL; + } + + meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + continue; + } + + if (!btf_type_is_ptr(t)) { + verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + return -EINVAL; + } + + if (reg->ref_obj_id) { + if (is_kfunc_release(meta) && meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + if (is_kfunc_release(meta)) + meta->release_regno = regno; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + if (kf_arg_type < 0) + return kf_arg_type; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_ALLOC_BTF_ID: + case KF_ARG_PTR_TO_BTF_ID: + if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) + break; + + if (!is_trusted_reg(reg)) { + if (!is_kfunc_rcu(meta)) { + verbose(env, "R%d must be referenced or trusted\n", regno); + return -EINVAL; + } + if (!is_rcu_reg(reg)) { + verbose(env, "R%d must be a rcu pointer\n", regno); + return -EINVAL; + } + } + + fallthrough; + case KF_ARG_PTR_TO_CTX: + /* Trusted arguments have the same offset checks as release arguments */ + arg_type |= OBJ_RELEASE; + break; + case KF_ARG_PTR_TO_KPTR: + case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_LIST_HEAD: + case KF_ARG_PTR_TO_LIST_NODE: + case KF_ARG_PTR_TO_MEM: + case KF_ARG_PTR_TO_MEM_SIZE: + /* Trusted by default */ + break; + default: + WARN_ON_ONCE(1); + return -EFAULT; + } + + if (is_kfunc_release(meta) && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); + if (ret < 0) + return ret; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_CTX: + if (reg->type != PTR_TO_CTX) { + verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); + return -EINVAL; + } + + if (meta->func_id == 
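/*
 * [Editor's illustration - not part of the patch text] The constant-argument
 * handling above (is_kfunc_arg_constant() plus mark_chain_precision()) is
 * what lets bpf_obj_new() pass a BTF type ID as a plain scalar: the
 * selftests' bpf_experimental.h (assumed here, quoted from memory) wraps the
 * kfunc roughly as
 *
 *      #define bpf_obj_new(type) \
 *              ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
 *
 * so local_type_id__k arrives as a known constant and the verifier can look
 * the program-local struct up by ID when typing the return value.
 */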
special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); + if (ret < 0) + return -EINVAL; + meta->ret_btf_id = ret; + } + break; + case KF_ARG_PTR_TO_ALLOC_BTF_ID: + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + if (meta->btf == btf_vmlinux && + meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + meta->arg_obj_drop.btf = reg->btf; + meta->arg_obj_drop.btf_id = reg->btf_id; + } + break; + case KF_ARG_PTR_TO_KPTR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_DYNPTR: + if (reg->type != PTR_TO_STACK && + reg->type != CONST_PTR_TO_DYNPTR) { + verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); + return -EINVAL; + } + + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_LIST_HEAD: + if (reg->type != PTR_TO_MAP_VALUE && + reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + return -EINVAL; + } + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_LIST_NODE: + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_BTF_ID: + /* Only base_type is checked, further checks are done here */ + if ((base_type(reg->type) != PTR_TO_BTF_ID || + (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && + !reg2btf_ids[base_type(reg->type)]) { + verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "expected %s or socket\n", + reg_type_str(env, base_type(reg->type) | + (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM: + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { + verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + return -EINVAL; + } + ret = check_mem_reg(env, reg, regno, type_size); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM_SIZE: + ret = check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1); + if (ret < 0) { + verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + return ret; + } + /* Skip next '__sz' argument */ + i++; + break; + } + } + + if (is_kfunc_release(meta) && !meta->release_regno) { + verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + + return 0; +} + static int check_kfunc_call(struct 
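/*
 * [Editor's illustration - not part of the patch text] Putting the argument
 * checks above together, a minimal BPF-side user of the new list API looks
 * roughly like the following (assumes the selftests' bpf_experimental.h and
 * its __contains()/bpf_obj_new() wrappers from the same series; a sketch,
 * not a verbatim quote):
 *
 *      struct elem {
 *              struct bpf_list_node node;
 *              int val;
 *      };
 *
 *      SEC(".data.A") struct bpf_spin_lock glock;
 *      SEC(".data.A") struct bpf_list_head ghead __contains(elem, node);
 *
 *      SEC("tc")
 *      int push_example(void *ctx)
 *      {
 *              struct elem *e = bpf_obj_new(typeof(*e));
 *
 *              if (!e)
 *                      return 0;
 *              e->val = 42;
 *              bpf_spin_lock(&glock);
 *              bpf_list_push_front(&ghead, &e->node);
 *              bpf_spin_unlock(&glock);
 *              return 0;
 *      }
 *
 * bpf_obj_new() acquires a referenced allocated object; pushing it onto the
 * list under the lock hands ownership over (the reference is marked
 * release_on_unlock above), and an object that is neither pushed nor passed
 * to bpf_obj_drop() is reported as an unreleased reference.
 */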
bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; + struct bpf_kfunc_call_arg_meta meta; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; + const struct btf_type *ret_t; struct btf *desc_btf; u32 *kfunc_flags; - bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -7585,17 +9092,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name); return -EACCES; } - acq = *kfunc_flags & KF_ACQUIRE; + + /* Prepare kfunc call metadata */ + memset(&meta, 0, sizeof(meta)); + meta.btf = desc_btf; + meta.func_id = func_id; + meta.kfunc_flags = *kfunc_flags; + meta.func_proto = func_proto; + meta.func_name = func_name; + + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { + verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); + return -EACCES; + } + + sleepable = is_kfunc_sleepable(&meta); + if (sleepable && !env->prog->aux->sleepable) { + verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); + return -EACCES; + } + + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); + rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { + verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); + return -EACCES; + } + + if (env->cur_state->active_rcu_lock) { + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + if (rcu_lock) { + verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + return -EINVAL; + } else if (rcu_unlock) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); + env->cur_state->active_rcu_lock = false; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + } else if (rcu_lock) { + env->cur_state->active_rcu_lock = true; + } else if (rcu_unlock) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); + return -EINVAL; + } /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); + err = check_kfunc_args(env, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted - * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 
*/ - if (err) { - err = release_reference(env, regs[err].ref_obj_id); + if (meta.release_regno) { + err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", func_name, func_id); @@ -7609,43 +9167,137 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_ptr(t)) { - verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); - return -EINVAL; + if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { + /* Only exception is bpf_obj_new_impl */ + if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } } if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { - ptr_type = btf_type_skip_modifiers(desc_btf, t->type, - &ptr_type_id); - if (!btf_type_is_struct(ptr_type)) { - ptr_type_name = btf_name_by_offset(desc_btf, - ptr_type->name_off); - verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", - func_name, btf_type_str(ptr_type), - ptr_type_name); - return -EINVAL; + ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); + + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + struct btf *ret_btf; + u32 ret_btf_id; + + if (unlikely(!bpf_global_ma_set)) + return -ENOMEM; + + if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta.arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + + env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; + env->insn_aux_data[insn_idx].kptr_struct_meta = + btf_find_struct_meta(ret_btf, ret_btf_id); + } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + env->insn_aux_data[insn_idx].kptr_struct_meta = + btf_find_struct_meta(meta.arg_obj_drop.btf, + meta.arg_obj_drop.btf_id); + } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || + meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { + struct btf_field *field = meta.arg_list_head.field; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = field->list_head.btf; + regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; + regs[BPF_REG_0].off = field->list_head.node_offset; + } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } else if (meta.func_id == 
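/*
 * [Editor's aside - illustration only] The active_rcu_lock handling earlier
 * in check_kfunc_call() supports a usage pattern like the sketch below
 * (assumes the bpf_rcu_read_lock()/bpf_rcu_read_unlock() kfunc declarations
 * used by the selftests, and a vmlinux BTF built with the "rcu" type tag,
 * cf. env->rcu_tag_supported):
 *
 *      struct task_struct *task = bpf_get_current_task_btf();
 *      struct task_struct *parent;
 *
 *      bpf_rcu_read_lock();
 *      parent = task->real_parent;     MEM_RCU-tagged pointer
 *      if (parent)
 *              bpf_printk("ppid %d", parent->pid);
 *      bpf_rcu_read_unlock();          after this, MEM_RCU registers are
 *                                      downgraded to PTR_UNTRUSTED
 *
 * Nesting bpf_rcu_read_lock(), unlocking without a lock held, or calling a
 * sleepable kfunc inside the critical section is rejected, as checked above.
 */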
special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); + if (!ret_t || !btf_type_is_struct(ret_t)) { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta.arg_constant.value; + } else { + verbose(env, "kernel function %s unhandled dynamic return type\n", + meta.func_name); + return -EFAULT; + } + } else if (!__btf_type_is_struct(ptr_type)) { + if (!meta.r0_size) { + ptr_type_name = btf_name_by_offset(desc_btf, + ptr_type->name_off); + verbose(env, + "kernel function %s returns pointer type %s %s is not supported\n", + func_name, + btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM; + regs[BPF_REG_0].mem_size = meta.r0_size; + + if (meta.r0_rdonly) + regs[BPF_REG_0].type |= MEM_RDONLY; + + /* Ensures we don't access the memory after a release_reference() */ + if (meta.ref_obj_id) + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; } - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; - if (*kfunc_flags & KF_RET_NULL) { + + if (is_kfunc_ret_null(&meta)) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); - if (acq) { + if (is_kfunc_acquire(&meta)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) return id; - regs[BPF_REG_0].id = id; + if (is_kfunc_ret_null(&meta)) + regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; } + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) + regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -9073,6 +10725,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); + } else if (dst_reg->precise) { + /* if dst_reg is precise, src_reg should be precise as well */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; } } else { /* Pretend the src is a reg with a known value, since we only @@ -9274,34 +10931,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void __find_good_pkt_pointers(struct bpf_func_state *state, - struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, int new_range) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - if (reg->type == type && reg->id == dst_reg->id) - /* keep the maximum range already checked */ - reg->range = max(reg->range, new_range); - } - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } -} - static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - int new_range, i; + struct bpf_func_state *state; + struct bpf_reg_state 
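/*
 * [Editor's aside - illustration only] The two cast kfuncs whose return
 * values are typed above are used roughly as follows (signatures quoted
 * from memory; hedged):
 *
 *      void *bpf_cast_to_kern_ctx(void *obj);
 *              e.g. turn a tc program's struct __sk_buff pointer into the
 *              kernel's struct sk_buff; the result is PTR_TO_BTF_ID |
 *              PTR_TRUSTED and needs no runtime work, so fixup_kfunc_call()
 *              later replaces the call with a plain r0 = r1 move.
 *
 *      void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k);
 *              reinterpret any pointer as a read-only pointer to the given
 *              vmlinux type; the result is PTR_TO_BTF_ID | PTR_UNTRUSTED, so
 *              loads through it are emitted as BPF_PROBE_MEM and a bad
 *              pointer reads back as zero instead of faulting the kernel.
 */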
*reg; + int new_range; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -9366,9 +11003,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ - for (i = 0; i <= vstate->curframe; i++) - __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, - new_range); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + })); } static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) @@ -9830,17 +11469,20 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, bool is_null) { if (type_may_be_null(reg->type) && reg->id == id && - !WARN_ON_ONCE(!reg->id)) { - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || - !tnum_equals_const(reg->var_off, 0) || - reg->off)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. If we - * see this happening, don't convert the register. - */ + (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { + /* Old offset (both fixed and variable parts) should have been + * known-zero, because we don't allow pointer arithmetic on + * pointers that might be NULL. If we see this happening, don't + * convert the register. + * + * But in some cases, some helpers that return local kptrs + * advance offset for the returned pointer. In those cases, it + * is fine to expect to see reg->off. + */ + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) + return; + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off)) return; - } if (is_null) { reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point @@ -9857,7 +11499,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reg_references(). + * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. @@ -9867,22 +11509,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } -static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, - bool is_null) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } -} - /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -9890,10 +11516,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. 
@@ -9902,8 +11527,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i <= vstate->curframe; i++) - __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + mark_ptr_or_null_reg(state, reg, id, is_null); + })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -10016,23 +11642,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, { struct bpf_func_state *state; struct bpf_reg_state *reg; - int i, j; - for (i = 0; i <= vstate->curframe; i++) { - state = vstate->frame[i]; - for (j = 0; j < MAX_BPF_REG; j++) { - reg = &state->regs[j]; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - - bpf_for_each_spilled_reg(j, state, reg) { - if (!reg) - continue; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - } + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + })); } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -10042,6 +11656,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; + struct bpf_reg_state *eq_branch_regs; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -10151,8 +11766,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. * this is only legit if both are scalars (or pointers to the same - * object, I suppose, but we don't support that right now), because - * otherwise the different base pointers mean the offsets aren't + * object, I suppose, see the PTR_MAYBE_NULL related if block below), + * because otherwise the different base pointers mean the offsets aren't * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { @@ -10201,6 +11816,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); } + /* if one pointer register is compared to another pointer + * register check if PTR_MAYBE_NULL could be lifted. + * E.g. register A - maybe null + * register B - not null + * for JNE A, B, ... - A is not null in the false branch; + * for JEQ A, B, ... - A is not null in the true branch. + */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && + __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && + type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { + eq_branch_regs = NULL; + switch (opcode) { + case BPF_JEQ: + eq_branch_regs = other_branch_regs; + break; + case BPF_JNE: + eq_branch_regs = regs; + break; + default: + /* do nothing */ + break; + } + if (eq_branch_regs) { + if (type_may_be_null(src_reg->type)) + mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]); + else + mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]); + } + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. 
@@ -10307,8 +11952,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) - dst_reg->id = ++env->id_gen; + WARN_ON_ONCE(map->max_entries != 1); + /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; @@ -10386,11 +12031,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -10592,7 +12242,7 @@ static int check_return_code(struct bpf_verifier_env *env) * 3 let S be a stack * 4 S.push(v) * 5 while S is not empty - * 6 t <- S.pop() + * 6 t <- S.peek() * 7 if t is what we're looking for: * 8 return t * 9 for all edges e in G.adjacentEdges(t) do @@ -10641,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state( return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -10674,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return -EINVAL; } - if (e == BRANCH) + if (e == BRANCH) { /* mark branch target for state pruning */ - init_explored_state(env, w); + mark_prune_point(env, w); + mark_jmp_point(env, w); + } if (insn_state[w] == 0) { /* tree-edge */ @@ -10703,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } -static int visit_func_call_insn(int t, int insn_cnt, - struct bpf_insn *insns, +static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { @@ -10714,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + mark_prune_point(env, t + 1); + /* when we exit from subprog, we need to record non-linear history */ + mark_jmp_point(env, t + 1); + if (visit_callee) { - init_explored_state(env, t); + mark_prune_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. 
__check_func_call() will do the actual @@ -10733,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt, * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ -static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int ret; if (bpf_pseudo_func(insns + t)) - return visit_func_call_insn(t, insn_cnt, insns, env, true); + return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && @@ -10752,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) case BPF_CALL: if (insns[t].imm == BPF_FUNC_timer_set_callback) - /* Mark this call insn to trigger is_state_visited() check - * before call itself is processed by __check_func_call(). - * Otherwise new async state will be pushed for further - * exploration. + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. */ - init_explored_state(env, t); - return visit_func_call_insn(t, insn_cnt, insns, env, + mark_prune_point(env, t); + return visit_func_call_insn(t, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -10771,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) if (ret) return ret; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). - */ - init_explored_state(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); return ret; default: /* conditional jump with two edges */ - init_explored_state(env, t); + mark_prune_point(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; @@ -10822,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env) while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; - ret = visit_insn(t, insn_cnt, env); + ret = visit_insn(t, env); switch (ret) { case DONE_EXPLORING: insn_state[t] = EXPLORED; @@ -11413,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; - if (rold->type == PTR_TO_STACK) - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return equal && rold->frameno == rcur->frameno; - - if (equal) - return true; - if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; @@ -11429,10 +13071,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; switch (base_type(rold->type)) { case SCALAR_VALUE: + if (equal) + return true; if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { - if (!rold->precise && !rcur->precise) + if (!rold->precise) return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -11475,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, */ return 
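/*
 * [Editor's aside - illustration only] The check_cond_jmp_op() change a few
 * hunks above lifts PTR_MAYBE_NULL when two pointer registers are compared
 * directly and only one of them may be NULL. In BPF C terms (sketch):
 *
 *      struct val *a = bpf_map_lookup_elem(&m, &k1);   may be NULL
 *      struct val *b = bpf_map_lookup_elem(&m, &k2);
 *
 *      if (!b)
 *              return 0;                               b now known non-NULL
 *      if (a == b)
 *              a->x = 1;       in the equal branch a inherits non-NULL from
 *                              b, so the store is accepted without a
 *                              separate "if (!a)" check
 *
 * With a != comparison the same marking applies in the branch where the test
 * fails, i.e. where the two pointers are equal.
 */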
memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -11499,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_CTX: - case CONST_PTR_TO_MAP: - case PTR_TO_PACKET_END: - case PTR_TO_FLOW_KEYS: - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - case PTR_TO_TCP_SOCK: - case PTR_TO_XDP_SOCK: - /* Only valid matches are exact, which memcmp() above - * would have accepted + case PTR_TO_STACK: + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar */ + return equal && rold->frameno == rcur->frameno; default: - /* Don't know what's going on, just say it's not safe */ - return false; + /* Only valid matches are exact, which memcmp() */ + return equal; } /* Shouldn't get here; if we do, say it's not safe */ @@ -11622,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat { int i; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], env->idmap_scratch)) @@ -11646,13 +13284,28 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. */ if (old->speculative && !cur->speculative) return false; - if (old->active_spin_lock != cur->active_spin_lock) + if (old->active_lock.ptr != cur->active_lock.ptr) + return false; + + /* Old and cur active_lock's have to be either both present + * or both absent. 
+ */ + if (!!old->active_lock.id != !!cur->active_lock.id) + return false; + + if (old->active_lock.id && + !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) + return false; + + if (old->active_rcu_lock != cur->active_rcu_lock) return false; /* for states to be equal callsites have to be the same @@ -11755,34 +13408,36 @@ static int propagate_precision(struct bpf_verifier_env *env, { struct bpf_reg_state *state_reg; struct bpf_func_state *state; - int i, err = 0; + int i, err = 0, fr; - state = old->frame[old->curframe]; - state_reg = state->regs; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating r%d\n", i); - err = mark_chain_precision(env, i); - if (err < 0) - return err; - } + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating r%d\n", i, fr); + err = mark_chain_precision_frame(env, fr, i); + if (err < 0) + return err; + } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack(env, i); - if (err < 0) - return err; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&state->stack[i])) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE, fr); + err = mark_chain_precision_stack_frame(env, fr, i); + if (err < 0) + return err; + } } return 0; } @@ -11814,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = env->test_state_freq ? true : false; - cur->last_insn_idx = env->prev_insn_idx; - if (!env->insn_aux_data[insn_idx].prune_point) - /* this 'insn_idx' instruction wasn't marked, so we will not - * be doing state search here - */ - return 0; - /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 * Do not add new state for future pruning if the verifier hasn't seen @@ -11955,10 +13603,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return push_jmp_history(env, cur); + return 0; if (!add_new_state) - return push_jmp_history(env, cur); + return 0; /* There were no equivalent states, remember the current one. 
* Technically the current state is not proven to be safe yet, @@ -11977,6 +13625,10 @@ next: env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; + /* forget precise markings we inherited, see __mark_chain_precision */ + if (env->bpf_capable) + mark_all_scalars_imprecise(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); @@ -12094,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, env->insn_idx); - if (err < 0) - return err; - if (err == 1) { - /* found equivalent state, can prune the search */ - if (env->log.level & BPF_LOG_LEVEL) { - if (do_print_state) - verbose(env, "\nfrom %d to %d%s: safe\n", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - else - verbose(env, "%d: safe\n", env->insn_idx); + state->last_insn_idx = env->prev_insn_idx; + + if (is_prune_point(env, env->insn_idx)) { + err = is_state_visited(env, env->insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (env->log.level & BPF_LOG_LEVEL) { + if (do_print_state) + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + else + verbose(env, "%d: safe\n", env->insn_idx); + } + goto process_bpf_exit; } - goto process_bpf_exit; + } + + if (is_jmp_point(env, env->insn_idx)) { + err = push_jmp_history(env, state); + if (err) + return err; } if (signal_pending(current)) @@ -12291,11 +13953,14 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock && - (insn->src_reg == BPF_PSEUDO_CALL || - insn->imm != BPF_FUNC_spin_unlock)) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; + if (env->cur_state->active_lock.ptr) { + if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || + (insn->src_reg == BPF_PSEUDO_CALL) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); @@ -12328,11 +13993,26 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "bpf_rcu_read_unlock is missing\n"); + return -EINVAL; + } + + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback + * function, for which reference_state must + * match caller reference state when it exits. 
+ */ + err = check_reference_leak(env); + if (err) + return err; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -12342,10 +14022,6 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_reference_leak(env); - if (err) - return err; - err = check_return_code(env); if (err) return err; @@ -12558,14 +14234,6 @@ err_put: return err; } -static int check_map_prealloc(struct bpf_map *map) -{ - return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH && - map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || - !(map->map_flags & BPF_F_NO_PREALLOC); -} - static bool is_tracing_prog_type(enum bpf_prog_type type) { switch (type) { @@ -12580,52 +14248,21 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } -static bool is_preallocated_map(struct bpf_map *map) -{ - if (!check_map_prealloc(map)) - return false; - if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) - return false; - return true; -} - static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); - /* - * Validate that trace type programs use preallocated hash maps. - * - * For programs attached to PERF events this is mandatory as the - * perf NMI can hit any arbitrary code sequence. - * - * All other trace types using preallocated hash maps are unsafe as - * well because tracepoint or kprobes can be inside locked regions - * of the memory allocator or at a place where a recursion into the - * memory allocator would see inconsistent state. - * - * On RT enabled kernels run-time allocation of all trace type - * programs is strictly prohibited due to lock type constraints. On - * !RT kernels it is allowed for backwards compatibility reasons for - * now, but warnings are emitted so developers are made aware of - * the unsafety and can fix their programs before this is enforced. - */ - if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { - if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { - verbose(env, "perf_event programs can only use preallocated hash map\n"); - return -EINVAL; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - verbose(env, "trace type programs can only use preallocated hash map\n"); + + if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_list_head yet\n"); return -EINVAL; } - WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); - verbose(env, "trace type programs with run-time allocated hash maps are unsafe. 
Switch to preallocated hash maps.\n"); } - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12642,7 +14279,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; @@ -12670,20 +14307,16 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: - if (!is_preallocated_map(map)) { - verbose(env, - "Sleepable programs can only use preallocated maps\n"); - return -EINVAL; - } - break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: break; default: verbose(env, - "Sleepable programs can only use array, hash, and ringbuf maps\n"); + "Sleepable programs can only use array, hash, ringbuf and local storage maps\n"); return -EINVAL; } @@ -13317,7 +14950,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, aux[adj_idx].ptr_type == PTR_TO_CTX) continue; - imm_rnd = get_random_int(); + imm_rnd = get_random_u32(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; rnd_hi32_patch[3].dst_reg = load_reg; @@ -13339,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; + /* Zero-extension is done by the caller. */ + if (bpf_pseudo_kfunc_call(&insn)) + continue; + if (WARN_ON(load_reg == -1)) { verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); return -EFAULT; @@ -13466,13 +15103,17 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: + /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike + * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * be said once it is marked PTR_UNTRUSTED, hence we must handle + * any faults for loads into such types. BPF_WRITE is disallowed + * for this case. + */ + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { - verbose(env, "Writes through BTF pointers are not allowed\n"); - return -EINVAL; } continue; default: @@ -13834,8 +15475,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -static int fixup_kfunc_call(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; @@ -13845,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, } /* insn->imm has the btf func_id. Replace it with - * an address (relative to __bpf_base_call). + * an address (relative to __bpf_call_base). 
*/ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { @@ -13854,8 +15495,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, return -EFAULT; } + *cnt = 0; insn->imm = desc->imm; - + if (insn->off) + return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); + insn_buf[1] = addr[0]; + insn_buf[2] = addr[1]; + insn_buf[3] = *insn; + *cnt = 4; + } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = *insn; + *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *cnt = 1; + } return 0; } @@ -13997,9 +15663,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) continue; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn); + ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; + if (cnt == 0) + continue; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } @@ -14117,13 +15793,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_task_storage_get || - insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get) { - if (env->prog->aux->sleepable) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - else + if (is_storage_get_function(insn->imm)) { + if (!env->prog->aux->sleepable || + env->insn_aux_data[i + delta].storage_get_func_atomic) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); insn_buf[1] = *insn; cnt = 2; @@ -14193,7 +15868,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (int (*)(struct bpf_map *map, bpf_callback_t callback_fn, @@ -14572,6 +16247,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); + state->first_insn_idx = env->subprog_info[subprog].start; + state->last_insn_idx = -1; regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { @@ -14981,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - /* fentry/fexit/fmod_ret progs can be sleepable only if they are + + /* fentry/fexit/fmod_ret progs can be sleepable if they are * attached to ALLOW_ERROR_INJECTION and are 
not in denylist. */ if (!check_non_sleepable_error_inject(btf_id) && within_error_injection_list(addr)) ret = 0; + /* fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + + if (flags && (*flags & KF_SLEEPABLE)) + ret = 0; + } break; case BPF_PROG_TYPE_LSM: /* LSM progs check that they are attached to bpf_lsm_*() funcs. @@ -15007,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } - ret = check_attach_modify_return(addr, tname); + ret = -EINVAL; + if (btf_kfunc_is_modify_return(btf, btf_id) || + !check_attach_modify_return(addr, tname)) + ret = 0; if (ret) { bpf_log(log, "%s() is not modifiable\n", tname); return ret; @@ -15196,10 +16886,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); + env->rcu_tag_supported = btf_vmlinux && + btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/capability.c b/kernel/capability.c index 765194f5d678..860fd22117c1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -489,8 +489,8 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct user_namespace *mnt_userns, const struct inode *inode) { - return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) && - kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode)); + return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && + vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); } /** diff --git a/kernel/cfi.c b/kernel/cfi.c index 2046276ee234..08caad776717 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -1,339 +1,101 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Clang Control Flow Integrity (CFI) error and slowpath handling. + * Clang Control Flow Integrity (CFI) error handling. * - * Copyright (C) 2021 Google LLC + * Copyright (C) 2022 Google LLC */ -#include <linux/hardirq.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/printk.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate.h> -#include <linux/vmalloc.h> -#include <asm/cacheflush.h> -#include <asm/set_memory.h> +#include <linux/cfi.h> -/* Compiler-defined handler names */ -#ifdef CONFIG_CFI_PERMISSIVE -#define cfi_failure_handler __ubsan_handle_cfi_check_fail -#else -#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort -#endif - -static inline void handle_cfi_failure(void *ptr) +enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, + unsigned long *target, u32 type) { - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) - WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr); + if (target) + pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n", + (void *)addr, (void *)*target, type); else - panic("CFI failure (target: %pS)\n", ptr); -} - -#ifdef CONFIG_MODULES -#ifdef CONFIG_CFI_CLANG_SHADOW -/* - * Index type. A 16-bit index can address at most (2^16)-2 pages (taking - * into account SHADOW_INVALID), i.e. ~256M with 4k pages. 
- */ -typedef u16 shadow_t; -#define SHADOW_INVALID ((shadow_t)~0UL) - -struct cfi_shadow { - /* Page index for the beginning of the shadow */ - unsigned long base; - /* An array of __cfi_check locations (as indices to the shadow) */ - shadow_t shadow[1]; -} __packed; - -/* - * The shadow covers ~128M from the beginning of the module region. If - * the region is larger, we fall back to __module_address for the rest. - */ -#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT) - -/* The in-memory size of struct cfi_shadow, always at least one page */ -#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT) -#define SHADOW_PAGES max(1UL, __SHADOW_PAGES) -#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT) - -/* The actual size of the shadow array, minus metadata */ -#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow)) -#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t)) - -static DEFINE_MUTEX(shadow_update_lock); -static struct cfi_shadow __rcu *cfi_shadow __read_mostly; + pr_err("CFI failure at %pS (no target information)\n", + (void *)addr); -/* Returns the index in the shadow for the given address */ -static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr) -{ - unsigned long index; - unsigned long page = ptr >> PAGE_SHIFT; - - if (unlikely(page < s->base)) - return -1; /* Outside of module area */ - - index = page - s->base; - - if (index >= SHADOW_ARR_SLOTS) - return -1; /* Cannot be addressed with shadow */ - - return (int)index; -} - -/* Returns the page address for an index in the shadow */ -static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - return (s->base + index) << PAGE_SHIFT; -} - -/* Returns the __cfi_check function address for the given shadow location */ -static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - if (unlikely(s->shadow[index] == SHADOW_INVALID)) - return 0; - - /* __cfi_check is always page aligned */ - return (s->base + s->shadow[index]) << PAGE_SHIFT; -} - -static void prepare_next_shadow(const struct cfi_shadow __rcu *prev, - struct cfi_shadow *next) -{ - int i, index, check; - - /* Mark everything invalid */ - memset(next->shadow, 0xFF, SHADOW_ARR_SIZE); - - if (!prev) - return; /* No previous shadow */ - - /* If the base address didn't change, an update is not needed */ - if (prev->base == next->base) { - memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE); - return; - } - - /* Convert the previous shadow to the new address range */ - for (i = 0; i < SHADOW_ARR_SLOTS; ++i) { - if (prev->shadow[i] == SHADOW_INVALID) - continue; - - index = ptr_to_shadow(next, shadow_to_ptr(prev, i)); - if (index < 0) - continue; - - check = ptr_to_shadow(next, - shadow_to_check_fn(prev, prev->shadow[i])); - if (check < 0) - continue; - - next->shadow[index] = (shadow_t)check; - } -} - -static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) -{ - int check_index; - unsigned long check = (unsigned long)mod->cfi_check; - unsigned long ptr; - - if (unlikely(!PAGE_ALIGNED(check))) { - pr_warn("cfi: not using shadow for module %s\n", mod->name); - return; + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + __warn(NULL, 0, (void *)addr, 0, regs, NULL); + return BUG_TRAP_TYPE_WARN; } - check_index = ptr_to_shadow(s, check); - if (check_index < 
0) - return; /* Module not addressable with shadow */ - - /* For each page, store the check function index in the shadow */ - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); - - if (index >= 0) { - /* Each page must only contain one module */ - WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID); - s->shadow[index] = (shadow_t)check_index; - } - } + return BUG_TRAP_TYPE_BUG; } -static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +static inline unsigned long trap_address(s32 *p) { - unsigned long ptr; - - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); - - if (index >= 0) - s->shadow[index] = SHADOW_INVALID; - } + return (unsigned long)((long)p + (long)*p); } -typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *, - unsigned long min_addr, unsigned long max_addr); - -static void update_shadow(struct module *mod, unsigned long base_addr, - update_shadow_fn fn) +static bool is_trap(unsigned long addr, s32 *start, s32 *end) { - struct cfi_shadow *prev; - struct cfi_shadow *next; - unsigned long min_addr, max_addr; - - next = vmalloc(SHADOW_SIZE); - - mutex_lock(&shadow_update_lock); - prev = rcu_dereference_protected(cfi_shadow, - mutex_is_locked(&shadow_update_lock)); - - if (next) { - next->base = base_addr >> PAGE_SHIFT; - prepare_next_shadow(prev, next); + s32 *p; - min_addr = (unsigned long)mod->core_layout.base; - max_addr = min_addr + mod->core_layout.text_size; - fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK); - - set_memory_ro((unsigned long)next, SHADOW_PAGES); - } - - rcu_assign_pointer(cfi_shadow, next); - mutex_unlock(&shadow_update_lock); - synchronize_rcu(); - - if (prev) { - set_memory_rw((unsigned long)prev, SHADOW_PAGES); - vfree(prev); + for (p = start; p < end; ++p) { + if (trap_address(p) == addr) + return true; } -} -void cfi_module_add(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, add_module_to_shadow); + return false; } -void cfi_module_remove(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, remove_module_from_shadow); -} - -static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s, - unsigned long ptr) +#ifdef CONFIG_MODULES +/* Populates `kcfi_trap(_end)?` fields in `struct module`. 
*/ +void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { - int index; - - if (unlikely(!s)) - return NULL; /* No shadow available */ - - index = ptr_to_shadow(s, ptr); - if (index < 0) - return NULL; /* Cannot be addressed with shadow */ + char *secstrings; + unsigned int i; - return (cfi_check_fn)shadow_to_check_fn(s, index); -} - -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - cfi_check_fn fn; + mod->kcfi_traps = NULL; + mod->kcfi_traps_end = NULL; - rcu_read_lock_sched_notrace(); - fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); - rcu_read_unlock_sched_notrace(); + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - return fn; -} - -#else /* !CONFIG_CFI_CLANG_SHADOW */ + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps")) + continue; -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - return NULL; + mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr; + mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size); + break; + } } -#endif /* CONFIG_CFI_CLANG_SHADOW */ - -static inline cfi_check_fn find_module_check_fn(unsigned long ptr) +static bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = NULL; struct module *mod; + bool found = false; rcu_read_lock_sched_notrace(); - mod = __module_address(ptr); - if (mod) - fn = mod->cfi_check; - rcu_read_unlock_sched_notrace(); - - return fn; -} - -static inline cfi_check_fn find_check_fn(unsigned long ptr) -{ - cfi_check_fn fn = NULL; - unsigned long flags; - bool rcu_idle; - - if (is_kernel_text(ptr)) - return __cfi_check; - /* - * Indirect call checks can happen when RCU is not watching. Both - * the shadow and __module_address use RCU, so we need to wake it - * up if necessary. 
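The rewritten kernel/cfi.c above no longer walks a shadow of __cfi_check pointers; it looks the faulting address up in the __kcfi_traps section, whose s32 entries are self-relative offsets decoded by trap_address(). A small stand-alone C sketch of that encoding follows, with made-up data purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Same decoding as trap_address() above: an entry stores target - &entry. */
static uintptr_t trap_address(const int32_t *p)
{
	return (uintptr_t)((intptr_t)p + *p);
}

int main(void)
{
	static char traps[2];		/* stand-ins for trap instructions */
	static int32_t table[2];	/* stand-in for __kcfi_traps entries */
	int i;

	for (i = 0; i < 2; i++)
		table[i] = (int32_t)((intptr_t)&traps[i] - (intptr_t)&table[i]);

	for (i = 0; i < 2; i++)
		printf("entry %d -> %p (expected %p)\n", i,
		       (void *)trap_address(&table[i]), (void *)&traps[i]);
	return 0;
}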
- */ - rcu_idle = !rcu_is_watching(); - if (rcu_idle) { - local_irq_save(flags); - ct_irq_enter(); - } - - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); + mod = __module_address(addr); + if (mod) + found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - if (rcu_idle) { - ct_irq_exit(); - local_irq_restore(flags); - } + rcu_read_unlock_sched_notrace(); - return fn; + return found; } - -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +#else /* CONFIG_MODULES */ +static inline bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = find_check_fn((unsigned long)ptr); - - if (likely(fn)) - fn(id, ptr, diag); - else /* Don't allow unchecked modules */ - handle_cfi_failure(ptr); + return false; } -EXPORT_SYMBOL(__cfi_slowpath_diag); +#endif /* CONFIG_MODULES */ -#else /* !CONFIG_MODULES */ +extern s32 __start___kcfi_traps[]; +extern s32 __stop___kcfi_traps[]; -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +bool is_cfi_trap(unsigned long addr) { - handle_cfi_failure(ptr); /* No modules */ -} -EXPORT_SYMBOL(__cfi_slowpath_diag); + if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps)) + return true; -#endif /* CONFIG_MODULES */ - -void cfi_failure_handler(void *data, void *ptr, void *vtable) -{ - handle_cfi_failure(ptr); + return is_module_cfi_trap(addr); } -EXPORT_SYMBOL(cfi_failure_handler); +#endif /* CONFIG_ARCH_USES_CFI_TRAPS */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 36b740cb3d59..367b0a42ada9 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,11 +164,9 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -250,6 +248,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); +void cgroup_attach_lock(bool lock_threadgroup); +void cgroup_attach_unlock(bool lock_threadgroup); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *locked) __acquires(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ff6a8099eb2a..52bb5a74a23b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,8 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); - cpus_read_lock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,8 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - percpu_up_write(&cgroup_threadgroup_rwsem); - cpus_read_unlock(); + cgroup_attach_unlock(true); mutex_unlock(&cgroup_mutex); return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e4bb5d57f4d1..c099cf3fa02d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = { static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; +static struct cftype cgroup_psi_files[]; /* cgroup 
optional features */ enum cgroup_opt_features { @@ -247,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +#ifdef CONFIG_DEBUG_CGROUP_REF +#define CGROUP_REF_FN_ATTRS noinline +#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); +#include <linux/cgroup_refcnt.h> +#endif + /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest @@ -1391,6 +1398,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * Returned cgroup is without refcount but it's valid as long as cset pins it. + */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { @@ -1402,6 +1412,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1413,6 +1424,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } + BUG_ON(!res_cgroup); return res_cgroup; } @@ -1435,23 +1447,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); - BUG_ON(!res); return res; } +/* + * Look up cgroup associated with current task's cgroup namespace on the default + * hierarchy. + * + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu + * pointers. + * - css_set_lock is not needed because we just read cset->dfl_cgrp. + * - As a bonus returned cgrp is pinned with the current because it cannot + * switch cgroup_ns asynchronously. + */ +static struct cgroup *current_cgns_cgroup_dfl(void) +{ + struct css_set *cset; + + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - res = __cset_cgroup_from_root(cset, root); - - BUG_ON(!res); - return res; + return __cset_cgroup_from_root(cset, root); } /* @@ -1689,12 +1714,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - cgroup_addrm_files(css, cgrp, cfts, false); + if (cgroup_on_dfl(cgrp)) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); + if (cgroup_psi_enabled()) + cgroup_addrm_files(css, cgrp, + cgroup_psi_files, false); + } else { + cgroup_addrm_files(css, cgrp, + cgroup1_base_files, false); + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); @@ -1717,14 +1746,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css) return 0; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - if (ret < 0) - return ret; + if (cgroup_on_dfl(cgrp)) { + ret = cgroup_addrm_files(&cgrp->self, cgrp, + cgroup_base_files, true); + if (ret < 0) + return ret; + + if (cgroup_psi_enabled()) { + ret = cgroup_addrm_files(&cgrp->self, cgrp, + cgroup_psi_files, true); + if (ret < 0) + return ret; + } 
+ } else { + cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); @@ -2050,7 +2087,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); - root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); + root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -2173,7 +2210,7 @@ static int cgroup_get_tree(struct fs_context *fc) struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - cgrp_dfl_visible = true; + WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; @@ -2361,7 +2398,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - ret = strlcpy(buf, "/", buflen); + ret = strscpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); @@ -2393,7 +2430,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ -static void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) @@ -2404,7 +2441,7 @@ static void cgroup_attach_lock(bool lock_threadgroup) * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ -static void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); @@ -2829,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, * take an rcu_read_lock. */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); @@ -3292,11 +3327,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - return ret; - - return 0; + return cgroup_update_dfl_csses(cgrp); } /** @@ -3689,27 +3720,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; @@ -3729,7 +3760,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); @@ -3746,21 +3777,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide {cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, @@ -3780,6 +3876,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) bool cgroup_psi_enabled(void) { + if (static_branch_likely(&psi_disabled)) + return false; + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } @@ -4132,8 +4231,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) @@ -4198,21 +4295,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts) cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ - cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | + __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; + int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; + if (cft->flags & __CFTYPE_ADDED) { + ret = -EBUSY; + break; + } if (cft->seq_start) kf_ops = &cgroup_kf_ops; @@ -4226,26 +4327,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { - cgroup_exit_cftypes(cfts); - return -ENOMEM; + ret = -ENOMEM; + break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; + cft->flags |= __CFTYPE_ADDED; } - return 0; + if (ret) + cgroup_exit_cftypes(cfts); + return ret; } static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); - if (!cfts || !cfts[0].ss) - return -ENOENT; - list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); @@ -4267,6 +4368,12 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + if (!(cfts[0].flags & __CFTYPE_ADDED)) + return -ENOENT; + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); @@ -4372,6 +4479,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** + * cgroup_file_show - show or hide a hidden cgroup file + * @cfile: target cgroup_file obtained by setting cftype->file_offset + * @show: whether to show or hide + */ +void cgroup_file_show(struct cgroup_file *cfile, bool show) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + if (kn) + kernfs_show(kn, show); + + kernfs_put(kn); +} + +/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -5131,10 +5258,14 @@ static struct cftype cgroup_base_files[] = { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { } /* terminate */ +}; + +static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5142,7 +5273,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5150,12 +5281,27 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = 
cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { + .name = "cgroup.pressure", + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -5207,6 +5353,7 @@ static void css_free_rwork_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* @@ -5432,8 +5579,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), - GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -5485,7 +5631,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); + cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5918,6 +6064,7 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); @@ -6038,16 +6185,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id - * On success return the cgrp, on failure return NULL + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp = NULL; + struct cgroup *cgrp, *root_cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out; + return ERR_PTR(-ENOENT); + + if (kernfs_type(kn) != KERNFS_DIR) { + kernfs_put(kn); + return ERR_PTR(-ENOENT); + } rcu_read_lock(); @@ -6056,9 +6209,17 @@ struct cgroup *cgroup_get_from_id(u64 id) cgrp = NULL; rcu_read_unlock(); - kernfs_put(kn); -out: + + if (!cgrp) + return ERR_PTR(-ENOENT); + + root_cgrp = current_cgns_cgroup_dfl(); + if (!cgroup_is_descendant(cgrp, root_cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-ENOENT); + } + return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6088,7 +6249,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_visible) + if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -6154,16 +6315,37 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. 
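The new cgroup_psi_files[] table above adds irq.pressure (when CONFIG_IRQ_TIME_ACCOUNTING is set) and a cgroup.pressure knob whose write handler shows or hides the per-resource files via cgroup_file_show() and restarts accounting with psi_cgroup_restart(). A hedged userspace sketch of arming a PSI trigger on the new file, following the Documentation/accounting/psi.rst pattern; the cgroup path is an assumption, and the trigger uses the "full" metric, which is what irq.pressure exposes:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed cgroup; cgroup.pressure must still be 1 for this file to exist. */
	const char *path = "/sys/fs/cgroup/test/irq.pressure";
	/* Wake up when "full" IRQ stall exceeds 50ms within a 1s window. */
	const char *trig = "full 50000 1000000";
	struct pollfd pfd;
	int fd;

	fd = open(path, O_RDWR | O_NONBLOCK);
	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror(path);
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLPRI;
	while (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & POLLERR) {
			fprintf(stderr, "cgroup was removed\n");
			break;
		}
		if (pfd.revents & POLLPRI)
			printf("irq pressure threshold crossed\n");
	}
	close(fd);
	return 0;
}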
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. + */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. + * @f: file corresponding to cgroup2_dir + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); @@ -6635,8 +6817,10 @@ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup *root_cgrp; - kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + root_cgrp = current_cgns_cgroup_dfl(); + kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; @@ -6661,15 +6845,15 @@ out: EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ -struct cgroup *cgroup_get_from_fd(int fd) +struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct file *f; @@ -6678,10 +6862,29 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - cgrp = cgroup_get_from_file(f); + cgrp = cgroup_v1v2_get_from_file(f); fput(f); return cgrp; } + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. 
+ * @fd: fd obtained by open(cgroup2_dir) + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; +} EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) @@ -6794,9 +6997,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; - if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); @@ -6816,8 +7016,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, int ssid; ssize_t ret = 0; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + ret = show_delegatable_files(cgroup_base_files, buf + ret, + PAGE_SIZE - ret, NULL); + if (cgroup_psi_enabled()) + ret += show_delegatable_files(cgroup_psi_files, buf + ret, + PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1f3a55297f39..a29c0b13706b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -33,6 +33,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/kmod.h> +#include <linux/kthread.h> #include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> @@ -85,6 +86,30 @@ struct fmeter { spinlock_t lock; /* guards read or write of above */ }; +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, +}; + +static const char * const perr_strings[] = { + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus", + [PERR_INVPARENT] = "Parent is an invalid partition root", + [PERR_NOTPART] = "Parent is not a partition root", + [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", + [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", + [PERR_HOTPLUG] = "No cpu available due to hotplug", + [PERR_CPUSEMPTY] = "cpuset.cpus is empty", +}; + struct cpuset { struct cgroup_subsys_state css; @@ -168,6 +193,9 @@ struct cpuset { int use_parent_ecpus; int child_ecpus_count; + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; }; @@ -175,20 +203,22 @@ struct cpuset { /* * Partition root states: * - * 0 - not a partition root - * + * 0 - member (not a partition root) * 1 - partition root - * + * 2 - partition root without load balancing (isolated) * -1 - invalid partition root - * None of the cpus in cpus_allowed can be put into the parent's - * subparts_cpus. In this case, the cpuset is not a real partition - * root anymore. However, the CPU_EXCLUSIVE bit will still be set - * and the cpuset can be restored back to a partition root if the - * parent cpuset can give more CPUs back to this child cpuset. 
+ * -2 - invalid isolated partition root */ -#define PRS_DISABLED 0 -#define PRS_ENABLED 1 -#define PRS_ERROR -1 +#define PRS_MEMBER 0 +#define PRS_ROOT 1 +#define PRS_ISOLATED 2 +#define PRS_INVALID_ROOT -1 +#define PRS_INVALID_ISOLATED -2 + +static inline bool is_prs_invalid(int prs_state) +{ + return prs_state < 0; +} /* * Temporary cpumasks for working with partitions that are passed among @@ -268,25 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } -static inline int is_partition_root(const struct cpuset *cs) +static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } +static inline int is_partition_invalid(const struct cpuset *cs) +{ + return cs->partition_root_state < 0; +} + +/* + * Callers should hold callback_lock to modify partition_root_state. + */ +static inline void make_partition_invalid(struct cpuset *cs) +{ + if (is_partition_valid(cs)) + cs->partition_root_state = -cs->partition_root_state; +} + /* * Send notification event of whenever partition_root_state changes. */ -static inline void notify_partition_change(struct cpuset *cs, - int old_prs, int new_prs) +static inline void notify_partition_change(struct cpuset *cs, int old_prs) { - if (old_prs != new_prs) - cgroup_file_notify(&cs->partition_file); + if (old_prs == cs->partition_root_state) + return; + cgroup_file_notify(&cs->partition_file); + + /* Reset prs_err if not invalid */ + if (is_partition_valid(cs)) + WRITE_ONCE(cs->prs_err, PERR_NONE); } static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .partition_root_state = PRS_ENABLED, + .partition_root_state = PRS_ROOT, }; /** @@ -404,6 +452,41 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +/** + * partition_is_populated - check if partition has tasks + * @cs: partition root to be checked + * @excluded_child: a child cpuset to be excluded in task checking + * Return: true if there are tasks, false otherwise + * + * It is assumed that @cs is a valid partition root. @excluded_child should + * be non-NULL when this cpuset is going to become a partition itself. + */ +static inline bool partition_is_populated(struct cpuset *cs, + struct cpuset *excluded_child) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + + if (cs->css.cgroup->nr_populated_csets) + return true; + if (!excluded_child && !cs->nr_subparts_cpus) + return cgroup_is_populated(cs->css.cgroup); + + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (child == excluded_child) + continue; + if (is_partition_valid(child)) + continue; + if (cgroup_is_populated(child->css.cgroup)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. If none are found, @@ -467,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_rwsem held. The check can be skipped + * if on default hierarchy. 
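The partition states above are still driven from userspace through cpuset.cpus.partition, which on cgroup2 accepts "member", "root" and "isolated" (PRS_ISOLATED being a root partition with load balancing disabled). A hedged sketch of turning a child cpuset into an isolated partition; the cgroup path and CPU list are assumptions, and cpuset must already be enabled in the parent's cgroup.subtree_control:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret = -1;

	if (fd >= 0) {
		ret = write(fd, val, strlen(val));
		close(fd);
	}
	if (ret < 0)
		perror(path);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	char buf[128];
	int fd, n;

	/* Give the cgroup an exclusive CPU list, then turn it into a partition. */
	if (write_file("/sys/fs/cgroup/rt/cpuset.cpus", "2-3") ||
	    write_file("/sys/fs/cgroup/rt/cpuset.cpus.partition", "isolated"))
		return 1;

	/* Read back the resulting state, e.g. "isolated". */
	fd = open("/sys/fs/cgroup/rt/cpuset.cpus.partition", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("partition state: %s", buf);
	}
	close(fd);
	return 0;
}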
*/ -static void cpuset_update_task_spread_flag(struct cpuset *cs, +static void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + if (is_spread_page(cs)) task_set_spread_page(tsk); else @@ -659,22 +746,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - ret = -EINVAL; - cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - goto out; - } - - /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ @@ -698,6 +769,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) trial->cpus_allowed)) goto out; + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ + ret = -EINVAL; + cpuset_for_each_child(c, css, par) { + if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && + c != cur && + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + goto out; + if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && + c != cur && + nodes_intersects(trial->mems_allowed, c->mems_allowed)) + goto out; + } + ret = 0; out: rcu_read_unlock(); @@ -875,7 +962,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree if not a partition root */ - if (!is_partition_root(cp)) + if (!is_partition_valid(cp)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -1081,7 +1168,7 @@ static void rebuild_sched_domains_locked(void) if (top_cpuset.nr_subparts_cpus) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_root(cs)) { + if (!is_partition_valid(cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1127,10 +1214,18 @@ static void update_tasks_cpumask(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; + bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) + while ((task = css_task_iter_next(&it))) { + /* + * Percpu kthreads in top_cpuset are ignored + */ + if (top_cs && (task->flags & PF_KTHREAD) && + kthread_is_per_cpu(task)) + continue; set_cpus_allowed_ptr(task, cs->effective_cpus); + } css_task_iter_end(&it); } @@ -1165,15 +1260,18 @@ enum subparts_cmd { partcmd_enable, /* Enable partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's subparts_cpus */ + partcmd_invalidate, /* Make partition invalid */ }; +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on); /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset * @cpuset: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask - * Return: 0, 1 or an error code + * Return: 0 or a partition root state error code * * For partcmd_enable, the cpuset is being transformed from a non-partition * root to a partition root. 
The cpus_allowed mask of the given cpuset will @@ -1184,38 +1282,36 @@ enum subparts_cmd { * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 should always be returned. + * into parent's effective_cpus. 0 will always be returned. * - * For partcmd_update, if the optional newmask is specified, the cpu - * list is to be changed from cpus_allowed to newmask. Otherwise, - * cpus_allowed is assumed to remain the same. The cpuset should either - * be a partition root or an invalid partition root. The partition root - * state may change if newmask is NULL and none of the requested CPUs can - * be granted by the parent. The function will return 1 if changes to - * parent's subparts_cpus and effective_cpus happen or 0 otherwise. - * Error code should only be returned when newmask is non-NULL. + * For partcmd_update, if the optional newmask is specified, the cpu list is + * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is + * assumed to remain the same. The cpuset should either be a valid or invalid + * partition root. The partition root state may change from valid to invalid + * or vice versa. An error code will only be returned if transitioning from + * invalid to valid violates the exclusivity rule. * - * The partcmd_enable and partcmd_disable commands are used by - * update_prstate(). The partcmd_update command is used by - * update_cpumasks_hier() with newmask NULL and update_cpumask() with - * newmask set. + * For partcmd_invalidate, the current partition will be made invalid. * - * The checking is more strict when enabling partition root than the - * other two commands. - * - * Because of the implicit cpu exclusive nature of a partition root, - * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). + * The partcmd_enable and partcmd_disable commands are used by + * update_prstate(). An error code may be returned and the caller will check + * for error. + * + * The partcmd_update command is used by update_cpumasks_hier() with newmask + * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used + * by update_cpumask() with NULL newmask. In both cases, the callers won't + * check for error and so partition_root_state and prs_error will be updated + * directly. */ -static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, +static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, struct cpumask *newmask, struct tmpmasks *tmp) { - struct cpuset *parent = parent_cs(cpuset); + struct cpuset *parent = parent_cs(cs); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ int old_prs, new_prs; - bool part_error = false; /* Partition error? */ + int part_error = PERR_NONE; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1224,126 +1320,165 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * The new cpumask, if present, or the current cpus_allowed must * not be empty. */ - if (!is_partition_root(parent) || - (newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cpuset->cpus_allowed))) - return -EINVAL; - - /* - * Enabling/disabling partition root is not allowed if there are - * online children. 
- */ - if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) - return -EBUSY; - - /* - * Enabling partition root is not allowed if not all the CPUs - * can be granted from parent's effective_cpus or at least one - * CPU will be left after that. - */ - if ((cmd == partcmd_enable) && - (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || - cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) - return -EINVAL; + if (!is_partition_valid(parent)) { + return is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART; + } + if ((newmask && cpumask_empty(newmask)) || + (!newmask && cpumask_empty(cs->cpus_allowed))) + return PERR_CPUSEMPTY; /* - * A cpumask update cannot make parent's effective_cpus become empty. + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. */ adding = deleting = false; - old_prs = new_prs = cpuset->partition_root_state; + old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_enable) { - cpumask_copy(tmp->addmask, cpuset->cpus_allowed); + /* + * Enabling partition root is not allowed if cpus_allowed + * doesn't overlap parent's cpus_allowed. + */ + if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed)) + return PERR_INVCPUS; + + /* + * A parent can be left with no CPU as long as there is no + * task directly associated with the parent partition. + */ + if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && + partition_is_populated(parent, cs)) + return PERR_NOCPUS; + + cpumask_copy(tmp->addmask, cs->cpus_allowed); adding = true; } else if (cmd == partcmd_disable) { - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + /* + * Need to remove cpus from parent's subparts_cpus for valid + * partition root. + */ + deleting = !is_prs_invalid(old_prs) && + cpumask_and(tmp->delmask, cs->cpus_allowed, parent->subparts_cpus); + } else if (cmd == partcmd_invalidate) { + if (is_prs_invalid(old_prs)) + return 0; + + /* + * Make the current partition invalid. It is assumed that + * invalidation is caused by violating cpu exclusivity rule. + */ + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); + if (old_prs > 0) { + new_prs = -old_prs; + part_error = PERR_NOTEXCL; + } } else if (newmask) { /* * partcmd_update with newmask: * + * Compute add/delete mask to/from subparts_cpus + * * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->effective_cpus + * addmask = newmask & parent->cpus_allowed * & ~parent->subparts_cpus */ - cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); + cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->subparts_cpus); - cpumask_and(tmp->addmask, newmask, parent->effective_cpus); + cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); /* - * Return error if the new effective_cpus could become empty. + * Make partition invalid if parent's effective_cpus could + * become empty and there are tasks in the parent. */ if (adding && - cpumask_equal(parent->effective_cpus, tmp->addmask)) { - if (!deleting) - return -EINVAL; - /* - * As some of the CPUs in subparts_cpus might have - * been offlined, we need to compute the real delmask - * to confirm that. 
- */ - if (!cpumask_and(tmp->addmask, tmp->delmask, - cpu_active_mask)) - return -EINVAL; - cpumask_copy(tmp->addmask, parent->effective_cpus); + cpumask_subset(parent->effective_cpus, tmp->addmask) && + !cpumask_intersects(tmp->delmask, cpu_active_mask) && + partition_is_populated(parent, cs)) { + part_error = PERR_NOCPUS; + adding = false; + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); } } else { /* * partcmd_update w/o newmask: * - * addmask = cpus_allowed & parent->effective_cpus + * delmask = cpus_allowed & parent->subparts_cpus + * addmask = cpus_allowed & parent->cpus_allowed + * & ~parent->subparts_cpus * - * Note that parent's subparts_cpus may have been - * pre-shrunk in case there is a change in the cpu list. - * So no deletion is needed. + * This gets invoked either due to a hotplug event or from + * update_cpumasks_hier(). This can cause the state of a + * partition root to transition from valid to invalid or vice + * versa. So we still need to compute the addmask and delmask. + + * A partition error happens when: + * 1) Cpuset is valid partition, but parent does not distribute + * out any CPUs. + * 2) Parent has tasks and all its effective CPUs will have + * to be distributed out. */ - adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, - parent->effective_cpus); - part_error = cpumask_equal(tmp->addmask, - parent->effective_cpus); + cpumask_and(tmp->addmask, cs->cpus_allowed, + parent->cpus_allowed); + adding = cpumask_andnot(tmp->addmask, tmp->addmask, + parent->subparts_cpus); + + if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) || + (adding && + cpumask_subset(parent->effective_cpus, tmp->addmask) && + partition_is_populated(parent, cs))) { + part_error = PERR_NOCPUS; + adding = false; + } + + if (part_error && is_partition_valid(cs) && + parent->nr_subparts_cpus) + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); } + if (part_error) + WRITE_ONCE(cs->prs_err, part_error); if (cmd == partcmd_update) { - int prev_prs = cpuset->partition_root_state; - /* - * Check for possible transition between PRS_ENABLED - * and PRS_ERROR. + * Check for possible transition between valid and invalid + * partition root. */ - switch (cpuset->partition_root_state) { - case PRS_ENABLED: + switch (cs->partition_root_state) { + case PRS_ROOT: + case PRS_ISOLATED: if (part_error) - new_prs = PRS_ERROR; + new_prs = -old_prs; break; - case PRS_ERROR: + case PRS_INVALID_ROOT: + case PRS_INVALID_ISOLATED: if (!part_error) - new_prs = PRS_ENABLED; + new_prs = -old_prs; break; } - /* - * Set part_error if previously in invalid state. - */ - part_error = (prev_prs == PRS_ERROR); - } - - if (!part_error && (new_prs == PRS_ERROR)) - return 0; /* Nothing need to be done */ - - if (new_prs == PRS_ERROR) { - /* - * Remove all its cpus from parent's subparts_cpus. - */ - adding = false; - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* + * Transitioning between invalid to valid or vice versa may require + * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. + */ + if (old_prs != new_prs) { + if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && + (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) + return PERR_NOTEXCL; + if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + + /* * Change the parent's subparts_cpus. 
* Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. @@ -1369,18 +1504,32 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); if (old_prs != new_prs) - cpuset->partition_root_state = new_prs; + cs->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); - notify_partition_change(cpuset, old_prs, new_prs); - return cmd == partcmd_update; + if (adding || deleting) + update_tasks_cpumask(parent); + + /* + * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. + * rebuild_sched_domains_locked() may be called. + */ + if (old_prs != new_prs) { + if (old_prs == PRS_ISOLATED) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); + else if (new_prs == PRS_ISOLATED) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + } + notify_partition_change(cs, old_prs); + return 0; } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup + * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. @@ -1389,7 +1538,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * Called with cpuset_rwsem held */ -static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1399,14 +1549,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool update_parent = false; compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some CPUs. + * parent, which is guaranteed to have some CPUs unless + * it is a partition root that has explicitly distributed + * out all its CPUs. */ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { + if (is_partition_valid(cp) && + cpumask_equal(cp->cpus_allowed, cp->subparts_cpus)) + goto update_parent_subparts; + cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1420,14 +1577,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* * Skip the whole subtree if the cpumask remains the same - * and has no partition root state. + * and has no partition root state and force flag not set. */ - if (!cp->partition_root_state && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } +update_parent_subparts: /* * update_parent_subparts_cpumask() should have been called * for cs already in update_cpumask(). We should also call @@ -1437,36 +1595,22 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { - case PRS_DISABLED: - /* - * If parent is not a partition root or an - * invalid partition root, clear its state - * and its CS_CPU_EXCLUSIVE flag. 
- */ - WARN_ON_ONCE(cp->partition_root_state - != PRS_ERROR); - new_prs = PRS_DISABLED; - - /* - * clear_bit() is an atomic operation and - * readers aren't interested in the state - * of CS_CPU_EXCLUSIVE anyway. So we can - * just update the flag without holding - * the callback_lock. - */ - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); - break; - - case PRS_ENABLED: - if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) - update_tasks_cpumask(parent); + case PRS_ROOT: + case PRS_ISOLATED: + update_parent = true; break; - case PRS_ERROR: + default: /* - * When parent is invalid, it has to be too. + * When parent is not a partition root or is + * invalid, child partition roots become + * invalid too. */ - new_prs = PRS_ERROR; + if (is_partition_valid(cp)) + new_prs = -cp->partition_root_state; + WRITE_ONCE(cp->prs_err, + is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART); break; } } @@ -1475,42 +1619,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) continue; rcu_read_unlock(); + if (update_parent) { + update_parent_subparts_cpumask(cp, partcmd_update, NULL, + tmp); + /* + * The cpuset partition_root_state may become + * invalid. Capture it. + */ + new_prs = cp->partition_root_state; + } + spin_lock_irq(&callback_lock); - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { + if (cp->nr_subparts_cpus && !is_partition_valid(cp)) { + /* + * Put all active subparts_cpus back to effective_cpus. + */ + cpumask_or(tmp->new_cpus, tmp->new_cpus, + cp->subparts_cpus); + cpumask_and(tmp->new_cpus, tmp->new_cpus, + cpu_active_mask); cp->nr_subparts_cpus = 0; cpumask_clear(cp->subparts_cpus); - } else if (cp->nr_subparts_cpus) { + } + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus) { /* * Make sure that effective_cpus & subparts_cpus * are mutually exclusive. - * - * In the unlikely event that effective_cpus - * becomes empty. we clear cp->nr_subparts_cpus and - * let its child partition roots to compete for - * CPUs again. 
*/ cpumask_andnot(cp->effective_cpus, cp->effective_cpus, cp->subparts_cpus); - if (cpumask_empty(cp->effective_cpus)) { - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - cpumask_clear(cp->subparts_cpus); - cp->nr_subparts_cpus = 0; - } else if (!cpumask_subset(cp->subparts_cpus, - tmp->new_cpus)) { - cpumask_andnot(cp->subparts_cpus, - cp->subparts_cpus, tmp->new_cpus); - cp->nr_subparts_cpus - = cpumask_weight(cp->subparts_cpus); - } } - if (new_prs != old_prs) - cp->partition_root_state = new_prs; - + cp->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); - notify_partition_change(cp, old_prs, new_prs); + + notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -1526,7 +1672,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_root(cp))) + is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -1570,7 +1716,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -1588,6 +1734,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + bool invalidate = false; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1615,10 +1762,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; - #ifdef CONFIG_CPUMASK_OFFSTACK /* * Use the cpumasks in trialcs for tmpmasks when they are pointers @@ -1629,28 +1772,70 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, tmp.new_cpus = trialcs->cpus_allowed; #endif + retval = validate_change(cs, trialcs); + + if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + struct cpuset *cp, *parent; + struct cgroup_subsys_state *css; + + /* + * The -EINVAL error code indicates that partition sibling + * CPU exclusivity rule has been violated. We still allow + * the cpumask change to proceed while invalidating the + * partition. However, any conflicting sibling partitions + * have to be marked as invalid too. 
+ */ + invalidate = true; + rcu_read_lock(); + parent = parent_cs(cs); + cpuset_for_each_child(cp, css, parent) + if (is_partition_valid(cp) && + cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) { + rcu_read_unlock(); + update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp); + rcu_read_lock(); + } + rcu_read_unlock(); + retval = 0; + } + if (retval < 0) + return retval; + if (cs->partition_root_state) { - /* Cpumask of a partition root cannot be empty */ - if (cpumask_empty(trialcs->cpus_allowed)) - return -EINVAL; - if (update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp) < 0) - return -EINVAL; + if (invalidate) + update_parent_subparts_cpumask(cs, partcmd_invalidate, + NULL, &tmp); + else + update_parent_subparts_cpumask(cs, partcmd_update, + trialcs->cpus_allowed, &tmp); } + compute_effective_cpumask(trialcs->effective_cpus, trialcs, + parent_cs(cs)); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); /* - * Make sure that subparts_cpus is a subset of cpus_allowed. + * Make sure that subparts_cpus, if not empty, is a subset of + * cpus_allowed. Clear subparts_cpus if partition not valid or + * empty effective cpus with tasks. */ if (cs->nr_subparts_cpus) { - cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + if (!is_partition_valid(cs) || + (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) && + partition_is_populated(cs, NULL))) { + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + } else { + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, + cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } } spin_unlock_irq(&callback_lock); - update_cpumasks_hier(cs, &tmp); + /* effective_cpus will be updated here */ + update_cpumasks_hier(cs, &tmp, false); if (cs->partition_root_state) { struct cpuset *parent = parent_cs(cs); @@ -1972,7 +2157,7 @@ static void update_tasks_flags(struct cpuset *cs) css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); css_task_iter_end(&it); } @@ -2026,16 +2211,18 @@ out: return err; } -/* +/** * update_prstate - update partition_root_state - * cs: the cpuset to update - * new_prs: new partition root state + * @cs: the cpuset to update + * @new_prs: new partition root state + * Return: 0 if successful, != 0 if error * * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { - int err, old_prs = cs->partition_root_state; + int err = PERR_NONE, old_prs = cs->partition_root_state; + bool sched_domain_rebuilt = false; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; @@ -2043,28 +2230,33 @@ static int update_prstate(struct cpuset *cs, int new_prs) return 0; /* - * Cannot force a partial or invalid partition root to a full - * partition root. + * For a previously invalid partition root, leave it at being + * invalid if new_prs is not "member". */ - if (new_prs && (old_prs == PRS_ERROR)) - return -EINVAL; + if (new_prs && is_prs_invalid(old_prs)) { + cs->partition_root_state = -new_prs; + return 0; + } if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - err = -EINVAL; if (!old_prs) { /* * Turning on partition root requires setting the * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be NULL. + * cannot be empty. 
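
As the comment and sibling loop above show, a cpumask write that trips the sibling CPU-exclusivity check is no longer rejected; the change goes through and every overlapping sibling that is currently a valid partition is invalidated instead. A toy model of that policy only, with invented sibling records and a uint64_t in place of a cpumask:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy sibling record; uint64_t stands in for a cpumask. */
struct sibling_model {
        const char *name;
        uint64_t    cpus;
        bool        valid_partition;
};

/*
 * On an exclusivity conflict, let the cpumask change proceed and mark
 * every overlapping, currently valid sibling partition invalid.
 */
static void invalidate_conflicting(struct sibling_model *sib, int n, uint64_t newmask)
{
        for (int i = 0; i < n; i++) {
                if (sib[i].valid_partition && (sib[i].cpus & newmask)) {
                        sib[i].valid_partition = false;
                        printf("sibling %s invalidated (overlap %#llx)\n",
                               sib[i].name,
                               (unsigned long long)(sib[i].cpus & newmask));
                }
        }
}

int main(void)
{
        struct sibling_model sib[] = {
                { "A", 0x3, true },     /* CPUs 0-1 */
                { "B", 0xC, true },     /* CPUs 2-3 */
        };

        invalidate_conflicting(sib, 2, 0x6);    /* request CPUs 1-2 */
        return 0;
}
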
*/ - if (cpumask_empty(cs->cpus_allowed)) + if (cpumask_empty(cs->cpus_allowed)) { + err = PERR_CPUSEMPTY; goto out; + } err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) + if (err) { + err = PERR_NOTEXCL; goto out; + } err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); @@ -2072,47 +2264,77 @@ static int update_prstate(struct cpuset *cs, int new_prs) update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; } + + if (new_prs == PRS_ISOLATED) { + /* + * Disable the load balance flag should not return an + * error unless the system is running out of memory. + */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + sched_domain_rebuilt = true; + } + } else if (old_prs && new_prs) { + /* + * A change in load balance state only, no change in cpumasks. + */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); + sched_domain_rebuilt = true; + goto out; /* Sched domain is rebuilt in update_flag() */ } else { /* - * Turning off partition root will clear the - * CS_CPU_EXCLUSIVE bit. + * Switching back to member is always allowed even if it + * disables child partitions. */ - if (old_prs == PRS_ERROR) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - err = 0; - goto out; - } + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, + &tmpmask); - err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmpmask); - if (err) - goto out; + /* + * If there are child partitions, they will all become invalid. + */ + if (unlikely(cs->nr_subparts_cpus)) { + spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + compute_effective_cpumask(cs->effective_cpus, cs, parent); + spin_unlock_irq(&callback_lock); + } /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); + + if (!is_sched_load_balance(cs)) { + /* Make sure load balance is on */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); + sched_domain_rebuilt = true; + } } - /* - * Update cpumask of parent's tasks except when it is the top - * cpuset as some system daemons cannot be mapped to other CPUs. - */ - if (parent != &top_cpuset) - update_tasks_cpumask(parent); + update_tasks_cpumask(parent); if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); - rebuild_sched_domains_locked(); + if (!sched_domain_rebuilt) + rebuild_sched_domains_locked(); out: - if (!err) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, new_prs); - } + /* + * Make partition invalid if an error happen + */ + if (err) + new_prs = -new_prs; + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + /* + * Update child cpusets, if present. + * Force update if switching back to member. + */ + if (!list_empty(&cs->css.children)) + update_cpumasks_hier(cs, &tmpmask, !new_prs); + notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); - return err; + return 0; } /* @@ -2238,6 +2460,12 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; + /* + * Task cannot be moved to a cpuset with empty effective cpus. 
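
The update_prstate() changes above tie scheduler load balancing to the requested partition flavor: enabling an isolated partition clears CS_SCHED_LOAD_BALANCE, a root/isolated switch is a flag-only change, and falling back to member turns balancing back on. A tiny model of just that mapping; the state names and values are illustrative.

#include <stdbool.h>
#include <stdio.h>

enum { MEMBER, ROOT, ISOLATED };

/* "isolated" is the only flavor that runs without load balancing. */
static bool wants_load_balance(int prs)
{
        return prs != ISOLATED;
}

int main(void)
{
        int transitions[][2] = {
                { MEMBER, ROOT },       /* enable balanced partition  */
                { MEMBER, ISOLATED },   /* enable isolated partition  */
                { ROOT, ISOLATED },     /* flag-only change           */
                { ISOLATED, MEMBER },   /* back to member: balance on */
        };

        for (unsigned i = 0; i < sizeof(transitions) / sizeof(transitions[0]); i++) {
                int from = transitions[i][0], to = transitions[i][1];

                printf("%d -> %d: load balance %s\n",
                       from, to, wants_load_balance(to) ? "on" : "off");
        }
        return 0;
}
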
+ */ + if (cpumask_empty(cs->effective_cpus)) + goto out_unlock; + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->effective_cpus); if (ret) @@ -2285,12 +2513,28 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + bool cpus_updated, mems_updated; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); + cpus_updated = !cpumask_equal(cs->effective_cpus, + oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * In the default hierarchy, enabling cpuset in the child cgroups + * will trigger a number of cpuset_attach() calls with no change + * in effective cpus and mems. In that case, we can optimize out + * by skipping the task iteration and update. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !cpus_updated && !mems_updated) { + cpuset_attach_nodemask_to = cs->effective_mems; + goto out; + } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2306,14 +2550,19 @@ static void cpuset_attach(struct cgroup_taskset *tset) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); } /* * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside migration path proper. + * sleep and should be moved outside migration path proper. Skip it + * if there is no change in effective_mems and CS_MEMORY_MIGRATE is + * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; + if (!is_memory_migrate(cs) && !mems_updated) + goto out; + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); @@ -2336,6 +2585,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) } } +out: cs->old_mems_allowed = cpuset_attach_nodemask_to; cs->attach_in_progress--; @@ -2598,16 +2848,29 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); + const char *err, *type = NULL; switch (cs->partition_root_state) { - case PRS_ENABLED: + case PRS_ROOT: seq_puts(seq, "root\n"); break; - case PRS_DISABLED: + case PRS_ISOLATED: + seq_puts(seq, "isolated\n"); + break; + case PRS_MEMBER: seq_puts(seq, "member\n"); break; - case PRS_ERROR: - seq_puts(seq, "root invalid\n"); + case PRS_INVALID_ROOT: + type = "root"; + fallthrough; + case PRS_INVALID_ISOLATED: + if (!type) + type = "isolated"; + err = perr_strings[READ_ONCE(cs->prs_err)]; + if (err) + seq_printf(seq, "%s invalid (%s)\n", type, err); + else + seq_printf(seq, "%s invalid\n", type); break; } return 0; @@ -2626,9 +2889,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, * Convert "root" to ENABLED, and convert "member" to DISABLED. 
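
sched_partition_show() above now distinguishes "root invalid" from "isolated invalid" and appends a human-readable reason looked up from a perr_strings[] table indexed by prs_err. The table contents are not part of this hunk, so the index names and messages below are placeholders; only the printing shape follows the code above.

#include <stdio.h>

/* Hypothetical, trimmed-down error table for illustration only. */
enum { ERR_NONE, ERR_INVPARENT, ERR_NOTPART, ERR_NOCPUS };

static const char * const err_strings[] = {
        [ERR_NONE]      = NULL,
        [ERR_INVPARENT] = "parent is an invalid partition",
        [ERR_NOTPART]   = "parent is not a partition root",
        [ERR_NOCPUS]    = "no effective CPUs left",
};

/* "root"/"isolated" plus an optional parenthesized reason, as in the hunk. */
static void show_partition(const char *type, int valid, int err)
{
        if (valid)
                printf("%s\n", type);
        else if (err && err_strings[err])
                printf("%s invalid (%s)\n", type, err_strings[err]);
        else
                printf("%s invalid\n", type);
}

int main(void)
{
        show_partition("root", 1, ERR_NONE);
        show_partition("isolated", 0, ERR_NOCPUS);
        return 0;
}
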
*/ if (!strcmp(buf, "root")) - val = PRS_ENABLED; + val = PRS_ROOT; else if (!strcmp(buf, "member")) - val = PRS_DISABLED; + val = PRS_MEMBER; + else if (!strcmp(buf, "isolated")) + val = PRS_ISOLATED; else return -EINVAL; @@ -2807,11 +3072,15 @@ static struct cftype dfl_files[] = { }; -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of +/** + * cpuset_css_alloc - Allocate a cpuset css + * @parent_css: Parent css of the control group that the new cpuset will be + * part of + * Return: cpuset css on success, -ENOMEM on failure. + * + * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return + * top cpuset css otherwise. */ - static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -2927,7 +3196,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); percpu_down_write(&cpuset_rwsem); - if (is_partition_root(cs)) + if (is_partition_valid(cs)) update_prstate(cs, 0); if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && @@ -3103,7 +3372,8 @@ hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { - if (cpumask_empty(new_cpus)) + /* A partition root is allowed to have empty effective cpus */ + if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -3172,11 +3442,31 @@ retry: /* * In the unlikely event that a partition root has empty - * effective_cpus or its parent becomes erroneous, we have to - * transition it to the erroneous state. + * effective_cpus with tasks, we will have to invalidate child + * partitions, if present, by setting nr_subparts_cpus to 0 to + * reclaim their cpus. */ - if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || - (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus && is_partition_valid(cs) && + cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); + compute_effective_cpumask(&new_cpus, cs, parent); + } + + /* + * Force the partition to become invalid if either one of + * the following conditions hold: + * 1) empty effective cpus but not valid empty partition. + * 2) parent is invalid or doesn't grant any cpus to child + * partitions. + */ + if (is_partition_valid(cs) && (!parent->nr_subparts_cpus || + (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { + int old_prs, parent_prs; + + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); if (cs->nr_subparts_cpus) { spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; @@ -3185,39 +3475,32 @@ retry: compute_effective_cpumask(&new_cpus, cs, parent); } - /* - * If the effective_cpus is empty because the child - * partitions take away all the CPUs, we can keep - * the current partition and let the child partitions - * fight for available CPUs. 
- */ - if ((parent->partition_root_state == PRS_ERROR) || - cpumask_empty(&new_cpus)) { - int old_prs; - - update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, tmp); - old_prs = cs->partition_root_state; - if (old_prs != PRS_ERROR) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, PRS_ERROR); - } + old_prs = cs->partition_root_state; + parent_prs = parent->partition_root_state; + if (is_partition_valid(cs)) { + spin_lock_irq(&callback_lock); + make_partition_invalid(cs); + spin_unlock_irq(&callback_lock); + if (is_prs_invalid(parent_prs)) + WRITE_ONCE(cs->prs_err, PERR_INVPARENT); + else if (!parent_prs) + WRITE_ONCE(cs->prs_err, PERR_NOTPART); + else + WRITE_ONCE(cs->prs_err, PERR_HOTPLUG); + notify_partition_change(cs, old_prs); } cpuset_force_rebuild(); } /* - * On the other hand, an erroneous partition root may be transitioned - * back to a regular one or a partition root with no CPU allocated - * from the parent may change to erroneous. + * On the other hand, an invalid partition root may be transitioned + * back to a regular one. */ - if (is_partition_root(parent) && - ((cs->partition_root_state == PRS_ERROR) || - !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) - cpuset_force_rebuild(); + else if (is_partition_valid(parent) && is_partition_invalid(cs)) { + update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp); + if (is_partition_valid(cs)) + cpuset_force_rebuild(); + } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); @@ -3377,11 +3660,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block cpuset_track_online_nodes_nb = { - .notifier_call = cpuset_track_online_nodes, - .priority = 10, /* ??! 
*/ -}; - /** * cpuset_init_smp - initialize cpus_allowed * @@ -3399,7 +3677,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d173..1b6b21851e9d 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); } mutex_unlock(&freezer_mutex); @@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); freezer->state = 0; @@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset) __thaw_task(task); } else { freeze_task(task); + /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; @@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } + if (freezing(task) && !frozen(task)) + goto out_iter_end; } freezer->state |= CGROUP_FROZEN; @@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); freezer->state &= ~CGROUP_FROZEN; + if (was_freezing) + static_branch_dec(&freezer_active); unfreeze_cgroup(freezer); } } diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 511af87f685e..7695e60bcb40 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -47,6 +47,7 @@ struct pids_cgroup { */ atomic64_t counter; atomic64_t limit; + int64_t watermark; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css) kfree(css_pids(css)); } +static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids) +{ + /* + * This is racy, but we don't need perfectly accurate tallying of + * the watermark, and this lets us avoid extra atomic overhead. 
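
The pids controller hunk above adds a watermark (later exposed as pids.peak) that is deliberately updated without a compare-and-swap loop, accepting a small race in exchange for less atomic overhead. Here is a C11 sketch of the same relaxed high-water-mark pattern; the counter and the charge() helper are stand-ins, not the cgroup API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long counter;
static atomic_long watermark;   /* monotonically raised, may lag briefly */

static void charge(long num)
{
        long now = atomic_fetch_add_explicit(&counter, num,
                                             memory_order_relaxed) + num;

        /*
         * Racy on purpose: two chargers can both observe a stale watermark
         * and the smaller store can win.  That imprecision is the price
         * paid for skipping a compare-and-swap loop on every charge.
         */
        if (now > atomic_load_explicit(&watermark, memory_order_relaxed))
                atomic_store_explicit(&watermark, now, memory_order_relaxed);
}

int main(void)
{
        charge(3);
        charge(2);
        printf("current=%ld peak=%ld\n",
               atomic_load(&counter), atomic_load(&watermark));
        return 0;
}
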
+ */ + if (nr_pids > READ_ONCE(p->watermark)) + WRITE_ONCE(p->watermark, nr_pids); +} + /** * pids_cancel - uncharge the local pid count * @pids: the pid cgroup state @@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; parent_pids(p); p = parent_pids(p)) - atomic64_add(num, &p->counter); + for (p = pids; parent_pids(p); p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + pids_update_watermark(p, new); + } } /** @@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) */ if (new > limit) goto revert; + + /* + * Not technically accurate if we go over limit somewhere up + * the hierarchy, but that's tolerable for the watermark. + */ + pids_update_watermark(p, new); } return 0; @@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, return atomic64_read(&pids->counter); } +static s64 pids_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return READ_ONCE(pids->watermark); +} + static int pids_events_show(struct seq_file *sf, void *v) { struct pids_cgroup *pids = css_pids(seq_css(sf)); @@ -332,6 +360,11 @@ static struct cftype pids_files[] = { .flags = CFTYPE_NOT_ON_ROOT, }, { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = pids_peak_read, + }, + { .name = "events", .seq_show = pids_events_show, .file_offset = offsetof(struct pids_cgroup, events_file), diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index feb59380c896..793ecff29038 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -3,6 +3,10 @@ #include <linux/sched/cputime.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* + * A hook for bpf stat collectors to attach to and flush their stats. + * Together with providing bpf kfuncs for cgroup_rstat_updated() and + * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * collect cgroup stats can integrate with rstat for efficient flushing. + * + * A static noinline declaration here could cause the compiler to optimize away + * the function. A global noinline declaration will keep the definition, but may + * optimize away the callsite. Therefore, __weak is needed to ensure that the + * call is still emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. 
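
The rstat comment above explains why the BPF flush hook must be declared __weak noinline: the empty default has to survive optimization so that its call site remains attachable. As a loose analogy only, using the GCC/Clang weak attribute and an invented hook name, this shows how a weak, non-inlined default keeps its call site while allowing another definition to take over.

#include <stdio.h>

/*
 * Default, intentionally empty hook.  Declaring it weak and noinline keeps
 * the call emitted while letting another translation unit (or, in the
 * kernel case, a BPF program attached via fentry) supply the real body.
 */
__attribute__((weak, noinline))
void stat_flush_hook(int cpu)
{
        (void)cpu;      /* default: do nothing */
}

static void flush_all(int nr_cpus)
{
        for (int cpu = 0; cpu < nr_cpus; cpu++)
                stat_flush_hook(cpu);   /* call survives optimization */
}

int main(void)
{
        flush_all(2);
        puts("flushed");
        return 0;
}
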
+ */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +__weak noinline void bpf_rstat_flush(struct cgroup *cgrp, + struct cgroup *parent, int cpu) +{ +} + +__diag_pop(); + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); rcu_read_lock(); list_for_each_entry_rcu(css, &pos->rstat_css_list, @@ -501,3 +531,21 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } + +/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_ID_FLAGS(func, cgroup_rstat_updated) +BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_SET8_END(bpf_rstat_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_rstat_kfunc_ids, +}; + +static int __init bpf_rstat_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_rstat_kfunc_set); +} +late_initcall(bpf_rstat_kfunc_init); diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config new file mode 100644 index 000000000000..38a7c5362c9c --- /dev/null +++ b/kernel/configs/rust.config @@ -0,0 +1 @@ +CONFIG_RUST=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 8a44b93da0f3..c2f9c912df1c 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set # CONFIG_SLAB is not set -# CONFIG_SLUB is not set -CONFIG_SLOB=y +# CONFIG_SLOB_DEPRECATED is not set +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y diff --git a/kernel/cpu.c b/kernel/cpu.c index bbad5e375d3b..6c0a92ca6bb5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -663,21 +663,51 @@ static bool cpuhp_next_state(bool bringup, return true; } -static int cpuhp_invoke_callback_range(bool bringup, - unsigned int cpu, - struct cpuhp_cpu_state *st, - enum cpuhp_state target) +static int __cpuhp_invoke_callback_range(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target, + bool nofail) { enum cpuhp_state state; - int err = 0; + int ret = 0; while (cpuhp_next_state(bringup, &state, st, target)) { + int err; + err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL); - if (err) + if (!err) + continue; + + if (nofail) { + pr_warn("CPU %u %s state %s (%d) failed (%d)\n", + cpu, bringup ? 
"UP" : "DOWN", + cpuhp_get_step(st->state)->name, + st->state, err); + ret = -1; + } else { + ret = err; break; + } } - return err; + return ret; +} + +static inline int cpuhp_invoke_callback_range(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false); +} + +static inline void cpuhp_invoke_callback_range_nofail(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + __cpuhp_invoke_callback_range(bringup, cpu, st, target, true); } static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) @@ -999,7 +1029,6 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); - int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -1012,13 +1041,10 @@ static int take_cpu_down(void *_param) */ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1)); - /* Invoke the former CPU_DYING callbacks */ - ret = cpuhp_invoke_callback_range(false, cpu, st, target); - /* - * DYING must not fail! + * Invoke the former CPU_DYING callbacks. DYING must not fail! */ - WARN_ON_ONCE(ret); + cpuhp_invoke_callback_range_nofail(false, cpu, st, target); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -1296,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); - ret = cpuhp_invoke_callback_range(true, cpu, st, target); /* * STARTING must not fail! */ - WARN_ON_ONCE(ret); + cpuhp_invoke_callback_range_nofail(true, cpu, st, target); } /* @@ -2326,8 +2350,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr, if (st->state < target) ret = cpu_up(dev->id, target); - else + else if (st->state > target) ret = cpu_down(dev->id, target); + else if (WARN_ON(st->target != target)) + st->target = target; out: unlock_device_hotplug(); return ret ? ret : count; @@ -2688,6 +2714,7 @@ void __init boot_cpu_hotplug_init(void) cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); + this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a0eb4d5cf557..87ef6096823f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -383,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...) memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); } /* diff --git a/kernel/cred.c b/kernel/cred.c index e10c15f51c1f..811ad654abd1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -701,9 +701,9 @@ void __init cred_init(void) * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * - * @daemon is used to provide a base for the security record, but can be NULL. - * If @daemon is supplied, then the security data will be derived from that; - * otherwise they'll be set to 0 and no groups, full capabilities and no keys. 
+ * @daemon is used to provide a base cred, with the security data derived from + * that; if this is "&init_task", they'll be set to 0, no groups, full + * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * @@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) const struct cred *old; struct cred *new; + if (WARN_ON_ONCE(!daemon)) + return NULL; + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); - if (daemon) - old = get_task_cred(daemon); - else - old = get_cred(&init_cred); - + old = get_task_cred(daemon); validate_creds(old); *new = *old; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211..d5e9ccde3ab8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -50,7 +50,6 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> -#include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> #include <linux/security.h> @@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { - int i; - - for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache.vmas[i]) - continue; - flush_cache_range(current->vmacache.vmas[i], - addr, addr + BREAK_INSTR_SIZE); - } - } - /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 67d3c48a1522..5c7e9ba7cd6b 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; const char *cp; + int cookie; int len; if (msg_len == 0) @@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len) cp++; } - for_each_console(c) { - if (!(c->flags & CON_ENABLED)) + /* + * The console_srcu_read_lock() only provides safe console list + * traversal. The use of the ->write() callback relies on all other + * CPUs being stopped at the moment and console drivers being able to + * handle reentrance when @oops_in_progress is set. + * + * There is no guarantee that every console driver can handle + * reentrance in this way; the developer deploying the debugger + * is responsible for ensuring that the console drivers they + * have selected handle reentrance appropriately. 
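
Stepping back to the kernel/cpu.c hunk a little earlier in this section: __cpuhp_invoke_callback_range() now takes a nofail flag so that teardown paths can log a failing callback and press on instead of aborting the walk. A minimal userspace sketch of that control flow, with an invented step table and error value:

#include <stdbool.h>
#include <stdio.h>

typedef int (*step_fn)(void);

static int step_ok(void)   { return 0; }
static int step_fail(void) { return -5; }       /* pretend -EIO */

/*
 * Same shape as the reworked helper: in nofail mode every failure is
 * reported and the walk continues; otherwise the first error aborts
 * the walk and is returned to the caller.
 */
static int invoke_range(step_fn *steps, int n, bool nofail)
{
        int ret = 0;

        for (int i = 0; i < n; i++) {
                int err = steps[i]();

                if (!err)
                        continue;
                if (nofail) {
                        fprintf(stderr, "step %d failed (%d), continuing\n", i, err);
                        ret = -1;
                } else {
                        ret = err;
                        break;
                }
        }
        return ret;
}

int main(void)
{
        step_fn steps[] = { step_ok, step_fail, step_ok };

        printf("strict: %d\n", invoke_range(steps, 3, false));
        printf("nofail: %d\n", invoke_range(steps, 3, true));
        return 0;
}
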
+ */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if (!(console_srcu_read_flags(c) & CON_ENABLED)) continue; if (c == dbg_io_ops->cons) continue; @@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len) --oops_in_progress; touch_nmi_watchdog(); } + console_srcu_read_unlock(cookie); } int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 164ed9ef77a3..e39cb696cfbd 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -214,13 +214,22 @@ void __delayacct_freepages_end(void) &current->delays->freepages_count); } -void __delayacct_thrashing_start(void) +void __delayacct_thrashing_start(bool *in_thrashing) { + *in_thrashing = !!current->in_thrashing; + if (*in_thrashing) + return; + + current->in_thrashing = 1; current->delays->thrashing_start = local_clock(); } -void __delayacct_thrashing_end(void) +void __delayacct_thrashing_end(bool *in_thrashing) { + if (*in_thrashing) + return; + + current->in_thrashing = 0; delayacct_end(&current->delays->lock, &current->delays->thrashing_start, &current->delays->thrashing_delay, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 27f272381cf2..68106e3791f6 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> +#include <linux/kmsan.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); + kmsan_handle_dma(page, offset, size, dir); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, else ents = ops->map_sg(dev, sg, nents, dir, attrs); - if (ents > 0) + if (ents > 0) { + kmsan_handle_dma_sg(sg, nents, dir); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); - else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != -EREMOTEIO)) + } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != -EREMOTEIO)) { return -EIO; + } return ents; } @@ -494,6 +498,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, WARN_ON_ONCE(!dev->coherent_dma_mask); + /* + * DMA allocations can never be turned back into a page pointer, so + * requesting compound pages doesn't make sense (and can't even be + * supported at all by various backends). 
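
The delayacct change in this hunk threads an in_thrashing flag through __delayacct_thrashing_start()/__delayacct_thrashing_end() so that nested calls neither rearm the timestamp nor account the delay twice. A self-contained model of that re-entrancy guard; the clock and the per-task flag are stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in "clock": good enough to show when the timer is (re)armed. */
static long long fake_now;
static long long now_ns(void) { return ++fake_now; }

static _Thread_local bool task_in_thrashing;
static _Thread_local long long thrashing_start_ns;

/* Outer call arms the timer; nested calls only record that they nested. */
static void thrashing_start(bool *was_nested)
{
        *was_nested = task_in_thrashing;
        if (*was_nested)
                return;
        task_in_thrashing = true;
        thrashing_start_ns = now_ns();
}

/* Only the matching outer call disarms the timer and accounts the delay. */
static void thrashing_end(bool *was_nested, long long *total_ns)
{
        if (*was_nested)
                return;
        task_in_thrashing = false;
        *total_ns += now_ns() - thrashing_start_ns;
}

int main(void)
{
        long long total = 0;
        bool outer, inner;

        thrashing_start(&outer);
        thrashing_start(&inner);        /* nested: no effect on the timer */
        thrashing_end(&inner, &total);  /* nested: nothing accounted      */
        thrashing_end(&outer, &total);  /* the one and only accounting    */
        printf("accounted %lld time units across one nesting\n", total);
        return 0;
}
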
+ */ + if (WARN_ON_ONCE(flag & __GFP_COMP)) + return NULL; + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; @@ -548,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, return NULL; if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return NULL; + if (WARN_ON_ONCE(gfp & __GFP_COMP)) + return NULL; size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) @@ -633,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) return NULL; + if (WARN_ON_ONCE(gfp & __GFP_COMP)) + return NULL; if (ops && ops->alloc_noncontiguous) sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0ef6b12f961d..a34c38bbe28f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -300,6 +300,37 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } +static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)) +{ + size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); + void *tlb; + + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + + if (!tlb) { + pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", + __func__, bytes); + return NULL; + } + + if (remap && remap(tlb, nslabs) < 0) { + memblock_free(tlb, PAGE_ALIGN(bytes)); + pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); + return NULL; + } + + return tlb; +} + /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. @@ -310,7 +341,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs; size_t alloc_size; - size_t bytes; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) @@ -326,42 +356,32 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; - /* - * By default allocate the bounce buffer memory from low memory, but - * allow to pick a location everywhere for hypervisors with guest - * memory encryption. 
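
The new swiotlb_memblock_alloc() above folds the memblock allocation and the optional remap into one helper that returns NULL on any failure, which lets the caller retry with a progressively smaller slab count (that retry loop follows in the next hunk). A rough userspace model of the back-off policy; the minimum and alignment constants are placeholders, not the kernel's values.

#include <stdio.h>
#include <stdlib.h>

#define MIN_SLABS       512UL   /* placeholder lower bound */
#define SEG_ALIGN       128UL   /* placeholder alignment   */

static unsigned long align_up(unsigned long n, unsigned long a)
{
        return ((n + a - 1) / a) * a;
}

/* Pretend allocator: fails until the request is "small enough". */
static void *try_alloc(unsigned long nslabs, unsigned long limit)
{
        return nslabs > limit ? NULL : malloc(nslabs);
}

int main(void)
{
        unsigned long nslabs = 4096, limit = 1000;
        void *tlb;

        /*
         * Same shape as the new loop: on failure halve the request,
         * re-align it, and give up once the minimum would be crossed.
         */
        while ((tlb = try_alloc(nslabs, limit)) == NULL) {
                if (nslabs <= MIN_SLABS) {
                        fprintf(stderr, "bounce buffer unavailable\n");
                        return 1;
                }
                nslabs = align_up(nslabs / 2, SEG_ALIGN);
                fprintf(stderr, "retrying with %lu slabs\n", nslabs);
        }
        printf("allocated %lu slabs\n", nslabs);
        free(tlb);
        return 0;
}
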
- */ -retry: - bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); - if (flags & SWIOTLB_ANY) - tlb = memblock_alloc(bytes, PAGE_SIZE); - else - tlb = memblock_alloc_low(bytes, PAGE_SIZE); - if (!tlb) { - pr_warn("%s: failed to allocate tlb structure\n", __func__); - return; + while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { + if (nslabs <= IO_TLB_MIN_SLABS) + return; + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); } - if (remap && remap(tlb, nslabs) < 0) { - memblock_free(tlb, PAGE_ALIGN(bytes)); - - nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); - if (nslabs < IO_TLB_MIN_SLABS) - panic("%s: Failed to remap %zu bytes\n", - __func__, bytes); - goto retry; + if (default_nslabs != nslabs) { + pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", + default_nslabs, nslabs); + default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem->slots) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + if (!mem->slots) { + pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + return; + } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), default_nareas), SMP_CACHE_BYTES); - if (!mem->areas) - panic("%s: Failed to allocate mem->areas.\n", __func__); + if (!mem->areas) { + pr_warn("%s: Failed to allocate mem->areas.\n", __func__); + return; + } swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, default_nareas); @@ -545,9 +565,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } if (PageHighMem(pfn_to_page(pfn))) { - /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; - char *buffer; + struct page *page; unsigned int sz = 0; unsigned long flags; @@ -555,12 +574,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); - buffer = kmap_atomic(pfn_to_page(pfn)); + page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); + memcpy_from_page(vaddr, page, offset, sz); else - memcpy(buffer + offset, vaddr, sz); - kunmap_atomic(buffer); + memcpy_to_page(page, offset, vaddr, sz); local_irq_restore(flags); size -= sz; @@ -731,8 +749,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, int index; phys_addr_t tlb_addr; - if (!mem || !mem->nslabs) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + if (!mem || !mem->nslabs) { + dev_warn_ratelimited(dev, + "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + return (phys_addr_t)DMA_MAPPING_ERROR; + } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 063068a9ea9b..846add8394c4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> +#include <linux/kmsan.h> #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> @@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) user_exit_irqoff(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); 
instrumentation_end(); } @@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..eacc3702654d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -54,6 +54,7 @@ #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> +#include <linux/task_work.h> #include "internal.h" @@ -154,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -183,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event) return READ_ONCE(event->owner) == TASK_TOMBSTONE; } +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +struct perf_event_context *perf_cpu_task_ctx(void) +{ + lockdep_assert_irqs_disabled(); + return this_cpu_ptr(&perf_cpu_context)->task_ctx; +} + /* * On task ctx scheduling... 
* @@ -216,7 +219,7 @@ static int event_function(void *info) struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; @@ -313,7 +316,7 @@ again: static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; @@ -387,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -447,7 +449,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -570,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -690,13 +686,31 @@ do { \ ___p; \ }) +static void perf_ctx_disable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_disable(pmu_ctx->pmu); +} + +static void perf_ctx_enable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_enable(pmu_ctx->pmu); +} + +static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); + #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) @@ -822,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) } } -static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); - /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - struct perf_cpu_context *cpuctx, *tmp; - struct list_head *list; - unsigned long flags; - - /* - * Disable interrupts and preemption to avoid this CPU's - * cgrp_cpuctx_entry to change under us. 
- */ - local_irq_save(flags); cgrp = perf_cgroup_from_task(task, NULL); - list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - if (READ_ONCE(cpuctx->cgrp) == cgrp) - continue; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + return; - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to update_cgrp_time_from_cpuctx() in - * ctx_sched_out() - */ - cpuctx->cgrp = cgrp; - /* - * set cgrp before ctxsw in to allow - * perf_cgroup_set_timestamp() in ctx_sched_in() - * to not have to pass task around - */ - cpu_ctx_sched_in(cpuctx, EVENT_ALL); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_disable(&cpuctx->ctx); - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + ctx_sched_in(&cpuctx->ctx, EVENT_ALL); - local_irq_restore(flags); + perf_ctx_enable(&cpuctx->ctx); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -887,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event, heap_size++; for_each_possible_cpu(cpu) { - cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; @@ -971,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); - list_add(&cpuctx->cgrp_cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } static inline void @@ -993,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c return; cpuctx->cgrp = NULL; - list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1068,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task) */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); - cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); + cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); + rotations = perf_rotate_context(cpc); - raw_spin_lock(&cpuctx->hrtimer_lock); + raw_spin_lock(&cpc->hrtimer_lock); if (rotations) - hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + hrtimer_forward_now(hr, cpc->hrtimer_interval); else - cpuctx->hrtimer_active = 0; - raw_spin_unlock(&cpuctx->hrtimer_lock); + cpc->hrtimer_active = 0; + raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; + struct pmu *pmu = cpc->epc.pmu; u64 interval; - /* no multiplexing needed for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return; - /* * check default is sane, if not set then force to * default interval (1/tick) @@ -1104,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - raw_spin_lock_init(&cpuctx->hrtimer_lock); + raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; - /* not for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return 0; - - raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); - if (!cpuctx->hrtimer_active) { - cpuctx->hrtimer_active = 1; - hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); + if (!cpc->hrtimer_active) { + cpc->hrtimer_active = 1; + hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } - raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); + raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } +static int perf_mux_hrtimer_restart_ipi(void *arg) +{ + return perf_mux_hrtimer_restart(arg); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1146,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, active_ctx_list); - -/* - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and - * perf_event_task_tick() are fully serialized because they're strictly cpu - * affine and perf_event_ctx{activate,deactivate} are called with IRQs - * disabled, while perf_event_task_tick is called from IRQ context. 
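
perf_pmu_disable()/perf_pmu_enable() above keep a per-CPU nesting count so that only the outermost disable/enable pair actually invokes the PMU callbacks, which is what lets helpers such as perf_ctx_disable()/perf_ctx_enable() walk a whole pmu_ctx_list safely. The hunks only show fragments of those bodies, so treat this userspace sketch of the counting idiom as an assumption about the surrounding code rather than a copy of it.

#include <stdio.h>

struct pmu_model {
        const char *name;
        int disable_count;      /* per-CPU in the kernel; plain int here */
};

/* Only the outermost disable/enable actually touches the hardware. */
static void pmu_hw_disable(struct pmu_model *p) { printf("%s: hw off\n", p->name); }
static void pmu_hw_enable(struct pmu_model *p)  { printf("%s: hw on\n", p->name); }

static void pmu_disable(struct pmu_model *p)
{
        if (!p->disable_count++)
                pmu_hw_disable(p);
}

static void pmu_enable(struct pmu_model *p)
{
        if (!--p->disable_count)
                pmu_hw_enable(p);
}

int main(void)
{
        struct pmu_model pmu = { "cpu_pmu", 0 };

        pmu_disable(&pmu);      /* outer section: hardware disabled once  */
        pmu_disable(&pmu);      /* nested section: counter only           */
        pmu_enable(&pmu);
        pmu_enable(&pmu);       /* hardware re-enabled on the last enable */
        return 0;
}
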
- */ -static void perf_event_ctx_activate(struct perf_event_context *ctx) -{ - struct list_head *head = this_cpu_ptr(&active_ctx_list); - - lockdep_assert_irqs_disabled(); - - WARN_ON(!list_empty(&ctx->active_ctx_list)); - - list_add(&ctx->active_ctx_list, head); -} - -static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +static void perf_assert_pmu_disabled(struct pmu *pmu) { - lockdep_assert_irqs_disabled(); - - WARN_ON(list_empty(&ctx->active_ctx_list)); - - list_del_init(&ctx->active_ctx_list); + WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) @@ -1198,7 +1167,6 @@ static void free_ctx(struct rcu_head *head) struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1383,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; @@ -1399,7 +1367,7 @@ retry: */ local_irq_save(*flags); rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might @@ -1412,7 +1380,7 @@ retry: * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); @@ -1439,12 +1407,12 @@ retry: * reference count so that the context can't get freed. */ static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) +perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1468,6 +1436,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; @@ -1590,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event) * which provides ordering when rotating groups for the same CPU. 
*/ static __always_inline int -perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, - const u64 left_group_index, const struct perf_event *right) +perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, + const struct cgroup *left_cgroup, const u64 left_group_index, + const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; + if (left_pmu) { + if (left_pmu < right->pmu_ctx->pmu) + return -1; + if (left_pmu > right->pmu_ctx->pmu) + return 1; + } + #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); @@ -1640,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); - return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, - __node_2_pe(b)) < 0; + return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), + e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; + struct pmu *pmu; struct cgroup *cgroup; }; @@ -1654,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node) const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); - /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ - return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); + /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); +} + +static inline int +__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), + b->group_index, b); } /* - * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for - * key (see perf_event_groups_less). This places it last inside the CPU - * subtree. + * Insert @event into @groups' tree; using + * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} + * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, @@ -1711,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the cpu/cgroup subtree. + * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, - struct cgroup *cgrp) + struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, + .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; @@ -1730,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, return NULL; } -/* - * Like rb_entry_next_safe() for the @cpu subtree. 
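The perf_event_groups_cmp()/__group_cmp() hunks above extend the group rb-tree key from {cpu, cgroup, group_index} to {cpu, pmu, cgroup, group_index}, so every PMU gets its own contiguous sub-tree per CPU. A toy standalone comparator (plain C; the struct and names are simplified stand-ins, not the kernel's) reproduces the same lexicographic order with qsort():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct toy_event {
	int cpu;
	uintptr_t pmu;		/* stand-in for the struct pmu pointer */
	uintptr_t cgroup;	/* stand-in for the cgroup pointer */
	uint64_t group_index;	/* insertion order, breaks remaining ties */
};

static int toy_cmp(const void *pa, const void *pb)
{
	const struct toy_event *a = pa, *b = pb;

	if (a->cpu != b->cpu)
		return a->cpu < b->cpu ? -1 : 1;
	if (a->pmu != b->pmu)
		return a->pmu < b->pmu ? -1 : 1;
	if (a->cgroup != b->cgroup)
		return a->cgroup < b->cgroup ? -1 : 1;
	if (a->group_index != b->group_index)
		return a->group_index < b->group_index ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_event ev[] = {
		{ 1, 0x20, 0, 3 }, { 0, 0x10, 0, 2 }, { 1, 0x10, 0, 1 },
	};

	qsort(ev, 3, sizeof(ev[0]), toy_cmp);
	for (int i = 0; i < 3; i++)
		printf("cpu=%d pmu=%#lx idx=%llu\n", ev[i].cpu,
		       (unsigned long)ev[i].pmu,
		       (unsigned long long)ev[i].group_index);
	return 0;
}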
- */ static struct perf_event * -perf_event_groups_next(struct perf_event *event) +perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, + .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; @@ -1749,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event) return NULL; } +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ + for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ + event; event = perf_event_groups_next(event, pmu)) + /* * Iterate through the whole groups tree. */ @@ -1793,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) perf_cgroup_event_enable(event, ctx); ctx->generation++; + event->pmu_ctx->nr_events++; } /* @@ -1938,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event) lockdep_assert_held(&event->ctx->lock); /* - * We can have double attach due to group movement in perf_event_open. + * We can have double attach due to group movement (move_group) in + * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; @@ -2003,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) } ctx->generation++; + event->pmu_ctx->nr_events--; } static int @@ -2019,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *iter; /* @@ -2054,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, cpuctx, ctx); + event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -2105,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event, static inline struct list_head *get_event_list(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; + return event->attr.pinned ? &event->pmu_ctx->pinned_active : + &event->pmu_ctx->flexible_active; } /* @@ -2117,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event) */ static inline void perf_remove_sibling_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } @@ -2209,47 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int __pmu_filter_match(struct perf_event *event) -{ - struct pmu *pmu = event->pmu; - return pmu->filter_match ? pmu->filter_match(event) : 1; -} - -/* - * Check whether we should attempt to schedule an event group based on - * PMU-specific filtering. 
An event group can consist of HW and SW events, - * potentially with a SW leader, so we must check all the filters, to - * determine whether a group is schedulable: - */ -static inline int pmu_filter_match(struct perf_event *event) -{ - struct perf_event *sibling; - - if (!__pmu_filter_match(event)) - return 0; - - for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; - } - - return 1; -} - static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && - perf_cgroup_match(event) && pmu_filter_match(event); + perf_cgroup_match(event); } static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; + // XXX cpc serialization, probably per-cpu IRQ disabled + WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -2268,50 +2234,61 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + + if (event->pending_sigtrap) { + bool dec = true; + + event->pending_sigtrap = 0; + if (state != PERF_EVENT_STATE_OFF && + !event->pending_work) { + event->pending_work = 1; + dec = false; + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); + task_work_add(current, &event->pending_task, TWA_RESUME); + } + if (dec) + local_dec(&event->ctx->nr_pending); + } + perf_event_set_state(event, state); if (!is_software_event(event)) - cpuctx->active_oncpu--; - if (!--ctx->nr_active) - perf_event_ctx_deactivate(ctx); + cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; + if (event->attr.exclusive || !cpc->active_oncpu) + cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - perf_pmu_disable(ctx->pmu); + perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) - event_sched_out(event, cpuctx, ctx); - - perf_pmu_enable(ctx->pmu); + event_sched_out(event, ctx); } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL +#define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event @@ -2325,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { + struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { @@ -2332,19 +2310,38 @@ __perf_remove_from_context(struct perf_event *event, update_cgrp_time_from_cpuctx(cpuctx, false); } - event_sched_out(event, cpuctx, ctx); + /* + * Ensure event_sched_out() switches to OFF, at the 
very least + * this avoids raising perf_pending_task() at this time. + */ + if (flags & DETACH_DEAD) + event->pending_disable = 1; + event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); + if (flags & DETACH_DEAD) + event->state = PERF_EVENT_STATE_DEAD; + + if (!pmu_ctx->nr_events) { + pmu_ctx->rotate_necessary = 0; + + if (ctx->task && ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; - ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2374,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - /* - * Cgroup events are per-cpu events, and must IPI because of - * cgrp_cpuctx_list. - */ - if (!ctx->is_active && !is_cgroup_event(event)) { - __perf_remove_from_context(event, __get_cpu_context(ctx), + if (!ctx->is_active) { + __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; @@ -2405,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } + perf_pmu_disable(event->pmu_ctx->pmu); + if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); + group_sched_out(event, ctx); else - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); + + perf_pmu_enable(event->pmu_ctx->pmu); } /* @@ -2424,7 +2421,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
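event_sched_out() in the hunks above now hands an undelivered SIGTRAP over to task context: it sets ->pending_work, takes an extra reference on the event and queues a task_work item, and ctx->nr_pending is only dropped once either that work has run or it is clear nothing will run. The toy below (standalone C, invented names, no concurrency) models just that reference/pending bookkeeping:

#include <stdio.h>

struct toy_event {
	int refcount;
	int pending_sigtrap;	/* set from the overflow (NMI-ish) path */
	int pending_work;	/* deferred delivery has been queued */
	int state_off;		/* event is being switched OFF */
	int nr_pending;		/* models ctx->nr_pending */
};

static void toy_sched_out(struct toy_event *e)
{
	if (!e->pending_sigtrap)
		return;
	e->pending_sigtrap = 0;
	if (!e->state_off && !e->pending_work) {
		/* hand over to task context: keep nr_pending, pin the event */
		e->pending_work = 1;
		e->refcount++;
	} else {
		/* nothing will run later, drop the pending count right away */
		e->nr_pending--;
	}
}

static void toy_pending_task(struct toy_event *e)
{
	if (e->pending_work) {
		e->pending_work = 0;
		printf("deliver SIGTRAP in task context\n");
		e->nr_pending--;
	}
	e->refcount--;	/* matches the reference taken in toy_sched_out() */
}

int main(void)
{
	struct toy_event e = { .refcount = 1, .pending_sigtrap = 1, .nr_pending = 1 };

	toy_sched_out(&e);
	toy_pending_task(&e);
	printf("refcount=%d nr_pending=%d\n", e.refcount, e.nr_pending);
	return 0;
}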
*/ @@ -2463,9 +2460,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); + event->pending_disable = 1; + irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -2474,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2518,14 +2514,12 @@ event_sched_in(struct perf_event *event, } if (!is_software_event(event)) - cpuctx->active_oncpu++; - if (!ctx->nr_active++) - perf_event_ctx_activate(ctx); + cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) - cpuctx->exclusive = 1; + cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); @@ -2534,26 +2528,24 @@ out: } static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = ctx->pmu; + struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { - if (event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } @@ -2572,9 +2564,9 @@ group_error: if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); } - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); @@ -2584,10 +2576,11 @@ error: /* * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) +static int group_can_go_on(struct perf_event *event, int can_add_hw) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + /* * Groups consisting entirely of software events can always go on. */ @@ -2597,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event, * If an exclusive group is already on, no other hardware * events can go on. 
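group_can_go_on() in the surrounding hunks now consults the per-PMU, per-CPU state (cpc->exclusive and friends) rather than the old per-CPU context. Roughly, the admission rules it encodes look like the toy below (standalone C; deliberately simplified, e.g. the real function decides the exclusive case from the event lists, not a bare counter):

#include <stdio.h>
#include <stdbool.h>

struct toy_cpc { bool exclusive; int active; };	/* per-PMU, per-CPU state */
struct toy_group { bool software_only; bool exclusive; };

static bool can_go_on(const struct toy_group *g, const struct toy_cpc *cpc,
		      bool can_add_hw)
{
	if (g->software_only)		/* all-software groups always fit    */
		return true;
	if (cpc->exclusive)		/* an exclusive group is already on  */
		return false;
	if (g->exclusive && cpc->active)/* exclusive wants the PMU to itself */
		return false;
	return can_add_hw;
}

int main(void)
{
	struct toy_cpc cpc = { .exclusive = false, .active = 2 };
	struct toy_group sw = { .software_only = true };
	struct toy_group excl = { .exclusive = true };

	printf("software-only: %d, exclusive: %d\n",
	       can_go_on(&sw, &cpc, true), can_go_on(&excl, &cpc, true));
	return 0;
}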
*/ - if (cpuctx->exclusive) + if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already @@ -2619,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event, perf_group_attach(event); } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* @@ -2666,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ +/* + * XXX: ctx_resched() reschedule entire perf_event_context while adding new + * event to the context or enabling existing event in the context. We can + * probably optimize it by rescheduling only affected pmu_ctx. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2680,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; - ctx_event_type = event_type & EVENT_ALL; + event_type &= EVENT_ALL; - perf_pmu_disable(cpuctx->ctx.pmu); - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx, event_type); + perf_ctx_disable(&cpuctx->ctx); + if (task_ctx) { + perf_ctx_disable(task_ctx); + task_ctx_sched_out(task_ctx, event_type); + } /* * Decide which cpu ctx groups to schedule out based on the types @@ -2694,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. 
*/ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); - else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, event_type); + else if (event_type & EVENT_PINNED) + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx); - perf_pmu_enable(cpuctx->ctx.pmu); + + perf_ctx_enable(&cpuctx->ctx); + if (task_ctx) + perf_ctx_enable(task_ctx); } void perf_pmu_resched(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); @@ -2722,7 +2717,7 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; @@ -2764,7 +2759,7 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { @@ -2797,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx, WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) - event->cpu = cpu; + WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx @@ -2809,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. - * Similarly, cgroup events for the context also needs the IPI to - * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. @@ -2912,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2921,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); return; } @@ -2930,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. 
*/ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); return; } @@ -3199,11 +3192,52 @@ out: return err; } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { + struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; + struct pmu *pmu = pmu_ctx->pmu; + + if (ctx->task && !ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + + if (!event_type) + return; + + perf_pmu_disable(pmu); + if (event_type & EVENT_PINNED) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->pinned_active, + active_list) + group_sched_out(event, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->flexible_active, + active_list) + group_sched_out(event, ctx); + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + pmu_ctx->rotate_necessary = 0; + } + perf_pmu_enable(pmu); +} + +static void +ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); @@ -3251,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (!ctx->nr_active || !(is_active & EVENT_ALL)) - return; - - perf_pmu_disable(ctx->pmu); - if (is_active & EVENT_PINNED) { - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) - group_sched_out(event, cpuctx, ctx); - } - - if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) - group_sched_out(event, cpuctx, ctx); - - /* - * Since we cleared EVENT_FLEXIBLE, also clear - * rotate_necessary, is will be reset by - * ctx_flexible_sched_in() when needed. - */ - ctx->rotate_necessary = 0; - } - perf_pmu_enable(ctx->pmu); + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + __pmu_ctx_sched_out(pmu_ctx, is_active); } /* @@ -3376,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ + for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ + pos2 = list_first_entry(head2, typeof(*pos2), member); \ + !list_entry_is_head(pos1, head1, member) && \ + !list_entry_is_head(pos2, head2, member); \ + pos1 = list_next_entry(pos1, member), \ + pos2 = list_next_entry(pos2, member)) + +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event_pmu_context *prev_epc, *next_epc; + + if (!prev_ctx->nr_task_data) + return; + + double_list_for_each_entry(prev_epc, next_epc, + &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, + pmu_ctx_entry) { + + if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) + continue; + + /* + * PMU specific parts of task perf context can require + * additional synchronization. 
As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (prev_epc->pmu->swap_task_ctx) + prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); + else + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); + } +} + +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) { - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_pmu_context *pmu_ctx; + struct perf_cpu_pmu_context *cpc; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + + if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) + pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + } +} + +static void +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; - struct perf_cpu_context *cpuctx; int do_switch = 1; - struct pmu *pmu; if (likely(!ctx)) return; - pmu = ctx->pmu; - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; + next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; @@ -3420,26 +3477,27 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - WRITE_ONCE(ctx->task, next); - WRITE_ONCE(next_ctx->task, task); + perf_ctx_disable(ctx); - perf_pmu_disable(pmu); + /* PMIs are disabled; ctx->nr_pending is stable. */ + if (local_read(&ctx->nr_pending) || + local_read(&next_ctx->nr_pending)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); - /* - * PMU specific parts of task perf context can require - * additional synchronization. As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (pmu->swap_task_ctx) - pmu->swap_task_ctx(ctx, next_ctx); - else - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_ctx_sched_task_cb(ctx, false); + perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); /* * RCU_INIT_POINTER here is safe because we've not @@ -3448,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, * since those values are always verified under * ctx->lock which we're now holding. 
*/ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); - RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; @@ -3463,37 +3521,40 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); +inside_switch: + perf_ctx_sched_task_cb(ctx, false); + task_ctx_sched_out(ctx, EVENT_ALL); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); + barrier(); - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + if (!--cpc->sched_cb_usage) + list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + if (!cpc->sched_cb_usage++) + list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + barrier(); this_cpu_inc(perf_sched_cb_usages); } @@ -3505,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpc->epc.pmu; + /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3527,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cpu_pmu_context *cpc; - if (prev == next) + /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ + if (prev == next || cpuctx->task_ctx) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - /* will be handled in perf_event_context_sched_in/out */ - if (cpuctx->task_ctx) - continue; - - __perf_pmu_sched_task(cpuctx, sched_in); - } + list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. 
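perf_event_context_sched_out() above keeps the old optimization of swapping the two tasks' cloned, equivalent contexts instead of unscheduling and rescheduling everything, but now falls back to the slow path when either context has nr_pending work that relies on ctx->task staying put. The standalone toy below only models that decision (names invented; the real fast path also requires context_equiv()):

#include <stdio.h>

struct toy_ctx { int nr_pending; };
struct toy_task { struct toy_ctx *ctx; const char *comm; };

/* Choose between the cheap "swap the cloned contexts" path and a full
 * unschedule, in the spirit of the rewritten perf_event_context_sched_out(). */
static void toy_sched_out(struct toy_task *prev, struct toy_task *next)
{
	if (prev->ctx && next->ctx &&
	    !prev->ctx->nr_pending && !next->ctx->nr_pending) {
		struct toy_ctx *tmp = prev->ctx;	/* contexts assumed equivalent */

		prev->ctx = next->ctx;
		next->ctx = tmp;
		printf("%s -> %s: swapped contexts\n", prev->comm, next->comm);
		return;
	}
	printf("%s -> %s: full unschedule (slow path)\n", prev->comm, next->comm);
}

int main(void)
{
	struct toy_ctx a = { 0 }, b = { 0 };
	struct toy_task t1 = { &a, "t1" }, t2 = { &b, "t2" };

	toy_sched_out(&t1, &t2);	/* fast path */
	a.nr_pending = 1;		/* pending work tied to ctx->task */
	toy_sched_out(&t1, &t2);	/* now takes the slow path */
	return 0;
}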
@@ -3561,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task, void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); + perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need @@ -3581,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_switch(next); } -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; @@ -3621,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event) } } -static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, +static void __link_epc(struct perf_event_pmu_context *pmu_ctx) +{ + struct perf_cpu_pmu_context *cpc; + + if (!pmu_ctx->ctx->task) + return; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = pmu_ctx; +} + +static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, + struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif + struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; - if (cpuctx) { + if (pmu->filter && pmu->filter(pmu, cpu)) + return 0; + + if (!ctx->task) { + cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, @@ -3655,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. 
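visit_groups_merge() in the surrounding hunks walks several already-sorted iterators — the cpu == -1 events, this CPU's events and, under CONFIG_CGROUP_PERF, one iterator per ancestor cgroup — now additionally restricted to one PMU, and always services the smallest group_index next; that is what the min-heap is for. A toy merge of sorted sequences (standalone C, with a linear scan standing in for the heap) shows the idea:

#include <stdio.h>

#define NITER 3

int main(void)
{
	/* three sorted "iterators" of group indices, -1 terminated */
	int iters[NITER][4] = { { 1, 5, 9, -1 }, { 2, 3, -1 }, { 4, -1 } };
	int pos[NITER] = { 0 };

	for (;;) {
		int best = -1;

		for (int i = 0; i < NITER; i++) {
			int v = iters[i][pos[i]];

			if (v >= 0 && (best < 0 || v < iters[best][pos[best]]))
				best = i;
		}
		if (best < 0)
			break;
		printf("schedule group_index %d\n", iters[best][pos[best]]);
		pos[best]++;	/* ~ perf_event_groups_next() */
	}
	return 0;
}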
*/ - __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif + if (event_heap.nr) { + __link_epc((*evt)->pmu_ctx); + perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); + } + min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { @@ -3673,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, if (ret) return ret; - *evt = perf_event_groups_next(*evt); + *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else @@ -3715,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event) static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) @@ -3724,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, cpuctx, *can_add_hw)) { - if (!group_sched_in(event, cpuctx, ctx)) + if (group_can_go_on(event, *can_add_hw)) { + if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } @@ -3735,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); + struct perf_cpu_pmu_context *cpc; + + event->pmu_ctx->rotate_necessary = 1; + cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); + perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } @@ -3744,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; - - visit_groups_merge(cpuctx, &ctx->pinned_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); + if (pmu) { + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; + if (pmu) { + visit_groups_merge(ctx, &ctx->flexible_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->flexible_groups, + 
smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } +} - visit_groups_merge(cpuctx, &ctx->flexible_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +{ + ctx_flexible_sched_in(ctx, pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); @@ -3810,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + ctx_pinned_sched_in(ctx, NULL); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + ctx_flexible_sched_in(ctx, NULL); } -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void perf_event_context_sched_in(struct task_struct *task) { - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; - ctx_sched_in(ctx, cpuctx, event_type); -} + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp); + if (!ctx) + goto rcu_unlock; -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; + if (cpuctx->task_ctx == ctx) { + perf_ctx_lock(cpuctx, ctx); + perf_ctx_disable(ctx); - cpuctx = __get_cpu_context(ctx); + perf_ctx_sched_task_cb(ctx, true); - /* - * HACK: for HETEROGENEOUS the task context might have switched to a - * different PMU, force (re)set the context, - */ - pmu = ctx->pmu = cpuctx->ctx.pmu; - - if (cpuctx->task_ctx == ctx) { - if (cpuctx->sched_cb_usage) - __perf_pmu_sched_task(cpuctx, true); - return; + perf_ctx_enable(ctx); + perf_ctx_unlock(cpuctx, ctx); + goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); @@ -3853,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3862,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. 
*/ - if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { + perf_ctx_disable(&cpuctx->ctx); + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + } + perf_event_sched_in(cpuctx, ctx); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, true); - perf_pmu_enable(pmu); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) + perf_ctx_enable(&cpuctx->ctx); + + perf_ctx_enable(ctx); unlock: perf_ctx_unlock(cpuctx, ctx); +rcu_unlock: + rcu_read_unlock(); } /* @@ -3889,16 +3973,7 @@ unlock: void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } + perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -4017,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) +static void +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event *event; struct hw_perf_event *hwc; @@ -4030,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ - if (!(ctx->nr_freq || needs_unthr)) + if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; @@ -4080,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_pmu_enable(event->pmu); } - perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4102,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_event_to_rotate(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; + struct rb_node *node; + struct rb_root *tree; + struct __group_key key = { + .pmu = pmu_ctx->pmu, + }; /* pick the first active flexible event */ - event = list_first_entry_or_null(&ctx->flexible_active, + event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); + if (event) + goto out; /* if no active flexible event, pick the first event */ - if (!event) { - event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), - typeof(*event), group_node); + tree = &pmu_ctx->ctx->flexible_groups.tree; + + if (!pmu_ctx->ctx->task) { + key.cpu = smp_processor_id(); + + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + goto out; } + key.cpu = -1; + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) { + event = __node_2_pe(node); + goto out; + } + + key.cpu = smp_processor_id(); + node = rb_find_first(&key, tree, 
__group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + +out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ - ctx->rotate_necessary = 0; + pmu_ctx->rotate_necessary = 0; return event; } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; - struct perf_event_context *task_ctx = NULL; int cpu_rotate, task_rotate; + struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - cpu_rotate = cpuctx->ctx.rotate_necessary; - task_ctx = cpuctx->task_ctx; - task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; + cpu_epc = &cpc->epc; + pmu = cpu_epc->pmu; + task_epc = cpc->task_epc; + + cpu_rotate = cpu_epc->rotate_necessary; + task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_pmu_disable(pmu); if (task_rotate) - task_event = ctx_event_to_rotate(task_ctx); + task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) - cpu_event = ctx_event_to_rotate(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (task_ctx && cpu_event)) - ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); - if (cpu_event) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_epc && cpu_event)) { + update_context_time(task_epc->ctx); + __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); + } - if (task_event) - rotate_ctx(task_ctx, task_event); - if (cpu_event) + if (cpu_event) { + update_context_time(&cpuctx->ctx); + __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); + __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + } - perf_event_sched_in(cpuctx, task_ctx); + if (task_event) + rotate_ctx(task_epc->ctx, task_event); + + if (task_event || (task_epc && cpu_event)) + __pmu_ctx_sched_in(task_epc->ctx, pmu); - perf_pmu_enable(cpuctx->ctx.pmu); + perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; @@ -4175,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - struct perf_event_context *ctx, *tmp; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); @@ -4185,8 +4296,13 @@ void perf_event_task_tick(void) throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) - perf_adjust_freq_unthr_context(ctx, throttled); + perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); + + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_adjust_freq_unthr_context(ctx, !!throttled); + rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, @@ -4208,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. 
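ctx_event_to_rotate() and perf_rotate_context() in the hunks above pick the first flexible event for this {cpu, pmu} (ignoring the cgroup part of the key, hence __group_cmp_ignore_cgroup) and rotate_ctx() re-inserts it with a fresh, larger group_index so it sorts last next time — classic round-robin multiplexing. The toy below (standalone C, an array standing in for the rb-tree order) shows the resulting rotation:

#include <stdio.h>

#define NGROUPS 3

/* Move the head to the tail; in the real code this is done by deleting the
 * event from the rb-tree and re-inserting it with ++groups->index as key. */
static void rotate(int grp[], int n)
{
	int first = grp[0];

	for (int i = 1; i < n; i++)
		grp[i - 1] = grp[i];
	grp[n - 1] = first;
}

int main(void)
{
	int groups[NGROUPS] = { 10, 11, 12 };	/* flexible groups, in tree order */

	for (int round = 0; round < 4; round++) {
		printf("round %d schedules group %d first\n", round, groups[0]);
		rotate(groups, NGROUPS);
	}
	return 0;
}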
* This expects task == current. */ -static void perf_event_enable_on_exec(int ctxn) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; @@ -4218,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn) int enabled = 0; local_irq_save(flags); - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx || !ctx->nr_events) + if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; - cpuctx = __get_cpu_context(ctx); + if (!ctx->nr_events) + goto out; + + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -4237,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4256,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event, * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ -static void perf_event_remove_on_exec(int ctxn) +static void perf_event_remove_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; - ctx = perf_pin_task_context(current, ctxn); - if (!ctx) - return; - mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) @@ -4287,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn) raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); - --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); - put_ctx(ctx); if (clone_ctx) put_ctx(clone_ctx); } @@ -4329,7 +4442,7 @@ static void __perf_event_read(void *info) struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* @@ -4555,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->active_ctx_list); + INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); - INIT_LIST_HEAD(&ctx->pinned_active); - INIT_LIST_HEAD(&ctx->flexible_active); refcount_set(&ctx->refcount, 1); } +static void +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) +{ + epc->pmu = pmu; + INIT_LIST_HEAD(&epc->pmu_ctx_entry); + INIT_LIST_HEAD(&epc->pinned_active); + INIT_LIST_HEAD(&epc->flexible_active); + atomic_set(&epc->refcount, 1); +} + static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) +alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; @@ -4576,7 +4697,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) 
__perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); - ctx->pmu = pmu; return ctx; } @@ -4605,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid) * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, - struct perf_event *event) +find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; - void *task_ctx_data = NULL; unsigned long flags; - int ctxn, err; - int cpu = event->cpu; + int err; if (!task) { /* Must be root to operate on a CPU event: */ @@ -4621,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (err) return ERR_PTR(err); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); @@ -4632,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - err = -ENOMEM; - goto errout; - } - } - retry: - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; - if (task_ctx_data && !ctx->task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { - ctx = alloc_perf_context(pmu, task); + ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; - if (task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -4677,12 +4773,12 @@ retry: */ if (task->flags & PF_EXITING) err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) + else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); @@ -4695,21 +4791,146 @@ retry: } } - free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } +static struct perf_event_pmu_context * +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, + struct perf_event *event) +{ + struct perf_event_pmu_context *new = NULL, *epc; + void *task_ctx_data = NULL; + + if (!ctx->task) { + struct perf_cpu_pmu_context *cpc; + + cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + epc = &cpc->epc; + + if (!epc->ctx) { + atomic_set(&epc->refcount, 1); + epc->embedded = 1; + raw_spin_lock_irq(&ctx->lock); + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + raw_spin_unlock_irq(&ctx->lock); + } else { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + } + + return epc; + } + + new = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = alloc_task_ctx_data(pmu); + if (!task_ctx_data) { + kfree(new); + return ERR_PTR(-ENOMEM); + } + } + + __perf_init_event_pmu_context(new, pmu); + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because perf_event_init_task() doesn't actually hold the + * child_ctx->mutex. 
+ */ + + raw_spin_lock_irq(&ctx->lock); + list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (epc->pmu == pmu) { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + goto found_epc; + } + } + + epc = new; + new = NULL; + + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + +found_epc: + if (task_ctx_data && !epc->task_ctx_data) { + epc->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + ctx->nr_task_data++; + } + raw_spin_unlock_irq(&ctx->lock); + + free_task_ctx_data(pmu, task_ctx_data); + kfree(new); + + return epc; +} + +static void get_pmu_ctx(struct perf_event_pmu_context *epc) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); +} + +static void free_epc_rcu(struct rcu_head *head) +{ + struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); + + kfree(epc->task_ctx_data); + kfree(epc); +} + +static void put_pmu_ctx(struct perf_event_pmu_context *epc) +{ + unsigned long flags; + + if (!atomic_dec_and_test(&epc->refcount)) + return; + + if (epc->ctx) { + struct perf_event_context *ctx = epc->ctx; + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + WARN_ON_ONCE(!list_empty(&epc->pinned_active)); + WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + + if (epc->embedded) + return; + + call_rcu(&epc->rcu_head, free_epc_rcu); +} + static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { - struct perf_event *event; + struct perf_event *event = container_of(head, typeof(*event), rcu_head); - event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); @@ -4847,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work) * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, - * 3) two matching events on the same context. + * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). @@ -4931,7 +5152,7 @@ static void perf_addr_filters_splice(struct perf_event *event, static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); unaccount_event(event); @@ -4971,6 +5192,9 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); + if (event->pmu_ctx) + put_pmu_ctx(event->pmu_ctx); + /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. @@ -5071,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event) LIST_HEAD(free_list); /* - * If we got here through err_file: fput(event_file); we will not have - * attached to a context yet. + * If we got here through err_alloc: free_event(event); we will not + * have attached to a context yet. 
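find_get_pmu_context()/put_pmu_ctx() in the hunks above refcount one perf_event_pmu_context per (PMU, context) pair: the per-CPU one is embedded in perf_cpu_pmu_context and is only unlinked on the last put, while task ones are allocated on demand and eventually freed (via RCU in the real code). The toy below (standalone C, invented names, no RCU or locking) models that lifetime rule:

#include <stdio.h>
#include <stdlib.h>

struct toy_epc {
	int refcount;
	int embedded;	/* lives inside a longer-lived per-CPU structure */
	int linked;	/* on the context's pmu_ctx_list */
};

static struct toy_epc *get_epc(struct toy_epc *epc)
{
	epc->refcount++;
	return epc;
}

static void put_epc(struct toy_epc *epc)
{
	if (--epc->refcount)
		return;
	epc->linked = 0;			/* detach from the context */
	if (epc->embedded) {
		printf("embedded epc: unlinked, not freed\n");
		return;
	}
	printf("task epc: freed\n");
	free(epc);
}

int main(void)
{
	static struct toy_epc cpu_epc = { .refcount = 1, .embedded = 1, .linked = 1 };
	struct toy_epc *task_epc = calloc(1, sizeof(*task_epc));

	task_epc->refcount = 1;
	task_epc->linked = 1;

	put_epc(get_epc(&cpu_epc));	/* an extra user comes and goes */
	put_epc(&cpu_epc);		/* last ref: unlink only */
	put_epc(task_epc);		/* last ref: actually freed */
	return 0;
}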
*/ if (!ctx) { WARN_ON_ONCE(event->attach_state & @@ -5085,9 +5309,7 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); - raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. @@ -5099,8 +5321,7 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - event->state = PERF_EVENT_STATE_DEAD; - raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); @@ -5507,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. @@ -5523,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); - perf_pmu_enable(ctx->pmu); + perf_pmu_enable(event->pmu); } } @@ -6431,7 +6652,8 @@ static void perf_sigtrap(struct perf_event *event) return; /* - * perf_pending_event() can race with the task exiting. + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ if (current->flags & PF_EXITING) return; @@ -6440,23 +6662,33 @@ static void perf_sigtrap(struct perf_event *event) event->attr.type, event->attr.sig_data); } -static void perf_pending_event_disable(struct perf_event *event) +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_irq(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running; we done. event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - - if (event->attr.sigtrap) { + if (event->pending_sigtrap) { + event->pending_sigtrap = 0; perf_sigtrap(event); - atomic_set_release(&event->event_limit, 1); /* rearm event */ - return; + local_dec(&event->ctx->nr_pending); + } + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); } - - perf_event_disable_local(event); return; } @@ -6476,35 +6708,64 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_irq(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); - perf_pending_event_disable(event); - + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. 
+ */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } + __perf_pending_irq(event); + if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_pending); + } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); + + put_event(event); +} + #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; @@ -6794,11 +7055,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +7087,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -6893,9 +7153,16 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; + unsigned long flags; u64 values[6]; int n = 0; + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). 
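The perf_output_read_group() path that continues below assembles the record user space consumes with a single read() on the group leader. A sketch of the consuming side, assuming the leader was opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID (the fixed cnt[16] cap is only for this sketch):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Layout for GROUP | TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING | ID. */
struct group_read {
        uint64_t nr;            /* 1 + number of siblings */
        uint64_t time_enabled;
        uint64_t time_running;
        struct {
                uint64_t value;
                uint64_t id;
        } cnt[16];              /* enough for a small group */
};

static int read_group(int leader_fd)
{
        struct group_read gr;

        if (read(leader_fd, &gr, sizeof(gr)) < 0)
                return -1;

        for (uint64_t i = 0; i < gr.nr; i++)
                printf("id %llu: %llu (enabled %llu, running %llu)\n",
                       (unsigned long long)gr.cnt[i].id,
                       (unsigned long long)gr.cnt[i].value,
                       (unsigned long long)gr.time_enabled,
                       (unsigned long long)gr.time_running);
        return 0;
}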
+ */ + local_irq_save(flags); + values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -6931,6 +7198,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ @@ -6967,11 +7236,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7053,14 +7317,14 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7229,7 +7493,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); - pmd = READ_ONCE(*pmdp); + pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; @@ -7303,6 +7567,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7310,7 +7575,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. 
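The filtered_sample_type computation that follows masks out sample bits the PMU driver has already provided in data->sample_flags, so the generic code only produces what is still missing. The idiom is simply "requested & ~already_provided"; a small standalone illustration with the UAPI sample bits:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* What the event asked for ... */
        uint64_t sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | PERF_SAMPLE_CALLCHAIN;
        /* ... and what a (hypothetical) PMU driver already filled in. */
        uint64_t sample_flags = PERF_SAMPLE_ADDR;

        /* Only the remaining bits need to be generated by generic code. */
        uint64_t filtered = sample_type & ~sample_flags;

        printf("still to do: %#llx\n", (unsigned long long)filtered);
        printf("callchain needed: %s\n",
               (filtered & PERF_SAMPLE_CALLCHAIN) ? "yes" : "no");
        return 0;
}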
+ */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); @@ -7318,7 +7588,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -7330,7 +7600,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7346,6 +7616,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; @@ -7353,8 +7624,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { - if (perf_sample_save_hw_index(event)) + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr @@ -7403,6 +7674,20 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7418,7 +7703,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF @@ -7621,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; - int ctxn; rcu_read_lock(); preempt_disable(); @@ -7638,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data, perf_iterate_sb_cpu(output, data); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_iterate_ctx(ctx, output, data, false); - } + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); @@ -7684,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) void perf_event_exec(void) { struct perf_event_context *ctx; - int ctxn; - for_each_task_context_nr(ctxn) { - perf_event_enable_on_exec(ctxn); - perf_event_remove_on_exec(ctxn); + ctx = perf_pin_task_context(current); + if (!ctx) + return; - rcu_read_lock(); - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, - NULL, true); - } - rcu_read_unlock(); - } + perf_event_enable_on_exec(ctx); + perf_event_remove_on_exec(ctx); + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, 
true); + + perf_unpin_context(ctx); + put_ctx(ctx); } struct remote_output { @@ -7737,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->ctx->pmu; - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -8527,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; - int ctxn; /* * Data tracing isn't supported yet and as such there is no need @@ -8537,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) return; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (!ctx) - continue; - + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); - } rcu_read_unlock(); } @@ -8931,7 +9205,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, - prog->aux->ksym.name); + subprog->aux->ksym.name); } } } @@ -9174,13 +9448,26 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -9203,24 +9490,59 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_addr = data->addr; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * The desired behaviour of sigtrap vs invalid samples is a bit + * tricky; on the one hand, one should not loose the SIGTRAP if + * it is the first event, on the other hand, we should also not + * trigger the WARN or override the data address. + */ + bool valid_sample = sample_is_allowed(event, regs); + unsigned int pending_id = 1; + + if (regs) + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; + if (!event->pending_sigtrap) { + event->pending_sigtrap = pending_id; + local_inc(&event->ctx->nr_pending); + } else if (event->attr.exclude_kernel && valid_sample) { + /* + * Should not be able to return to user space without + * consuming pending_sigtrap; with exceptions: + * + * 1. Where !exclude_kernel, events can overflow again + * in the kernel without returning to user space. + * + * 2. Events that can overflow again before the IRQ- + * work without user space progress (e.g. hrtimer). + * To approximate progress (with false negatives), + * check 32-bit hash of the current IP. 
+ */ + WARN_ON_ONCE(event->pending_sigtrap != pending_id); + } + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + irq_work_queue(&event->pending_irq); + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -9670,6 +9992,44 @@ static struct pmu perf_swevent = { #ifdef CONFIG_EVENT_TRACING +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -9719,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); +static void __perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event *event) +{ + struct trace_entry *entry = record; + + if (event->attr.config != entry->type) + return; + /* Cannot deliver synchronous signal to other task. 
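The overflow path above queues pending_sigtrap so the task receives a synchronous SIGTRAP once it is back in its own context. From user space this is driven entirely by perf_event_attr::sigtrap; a minimal sketch of a consumer, assuming reasonably recent kernel headers (remove_on_exec is required together with sigtrap, and sig_data is surfaced as si_perf_data):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

#ifndef TRAP_PERF
#define TRAP_PERF 6     /* si_code used for perf-generated SIGTRAP */
#endif

static void on_trap(int sig, siginfo_t *info, void *uctx)
{
        (void)sig; (void)uctx;
        /* info->si_perf_data carries attr.sig_data on new enough glibc/kernels. */
        if (info->si_code == TRAP_PERF)
                write(2, "perf SIGTRAP\n", 13);
}

int main(void)
{
        struct perf_event_attr attr;
        struct sigaction sa;
        int fd;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_trap;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGTRAP, &sa, NULL);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 1000000;
        attr.exclude_kernel = 1;        /* see the skid handling above */
        attr.sigtrap = 1;               /* deliver SIGTRAP on overflow */
        attr.remove_on_exec = 1;        /* required together with sigtrap */
        attr.sig_data = 0x1234;

        fd = syscall(SYS_perf_event_open, &attr, 0 /* this thread */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        for (volatile long i = 0; i < 100000000; i++)
                ;       /* burn instructions until SIGTRAPs arrive */

        close(fd);
        return 0;
}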
*/ + if (event->attr.sigtrap) + return; + if (perf_tp_event_match(event, data, regs)) + perf_swevent_event(event, count, data, regs); +} + +static void perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event_context *ctx) +{ + unsigned int cpu = smp_processor_id(); + struct pmu *pmu = &perf_tracepoint; + struct perf_event *event, *sibling; + + perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } + + perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } +} + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) @@ -9735,6 +10133,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_sample_data_init(&data, 0, 0); data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; perf_trace_buf_update(record, event_type); @@ -9749,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, */ if (task && task != current) { struct perf_event_context *ctx; - struct trace_entry *entry = record; rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->cpu != smp_processor_id()) - continue; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - /* Cannot deliver synchronous signal to other task. 
*/ - if (event->attr.sigtrap) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } + raw_spin_lock(&ctx->lock); + perf_tp_event_target_task(count, record, regs, &data, ctx); + raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } @@ -9777,44 +10165,6 @@ unlock: } EXPORT_SYMBOL_GPL(perf_tp_event); -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - /* - * no branch sampling for tracepoint events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe @@ -9989,8 +10339,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10016,7 +10374,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* @@ -10229,8 +10587,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; @@ -10892,36 +11251,9 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - static void free_pmu_context(struct pmu *pmu) { - /* - * Static contexts such as perf_sw_context have a global lifetime - * and may be shared between different PMUs. Avoid freeing them - * when a single PMU is going away. 
- */ - if (pmu->task_ctx_nr > perf_invalid_context) - return; - - free_percpu(pmu->pmu_cpu_context); + free_percpu(pmu->cpu_pmu_context); } /* @@ -10933,7 +11265,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10944,7 +11276,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10955,7 +11287,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -10985,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev, /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + struct perf_cpu_pmu_context *cpc; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - cpu_function_call(cpu, - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); @@ -11027,13 +11358,15 @@ static int pmu_dev_alloc(struct pmu *pmu) pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->release = pmu_dev_release; + + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + ret = device_add(pmu->dev); if (ret) goto free_dev; @@ -11101,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: - if (pmu->task_ctx_nr == perf_hw_context) { - static int hw_context_taken = 0; - - /* - * Other than systems with heterogeneous CPUs, it never makes - * sense for two PMUs to share perf_hw_context. PMUs which are - * uncore must use perf_invalid_context. 
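The sysfs show() routines below switch from snprintf() to scnprintf() because a show() callback must return the number of bytes actually placed in the buffer, while snprintf() returns the length the output would have had without truncation. A user-space model of the distinction (scnprintf() itself is a kernel helper; the wrapper below only imitates its contract):

#include <stdarg.h>
#include <stdio.h>

/* Imitation of the kernel's scnprintf(): returns bytes written, excluding the NUL. */
static int scnprintf_model(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int would_be;

        if (size == 0)
                return 0;

        va_start(args, fmt);
        would_be = vsnprintf(buf, size, fmt, args);
        va_end(args);

        if (would_be < 0)
                return 0;
        return (size_t)would_be >= size ? (int)(size - 1) : would_be;
}

int main(void)
{
        char page[8];

        /* snprintf() reports the untruncated length: 12 here, more than fits. */
        printf("snprintf  -> %d\n", snprintf(page, sizeof(page), "%s", "hello world!"));
        /* The model reports what really ended up in the buffer: 7. */
        printf("scnprintf -> %d\n", scnprintf_model(page, sizeof(page), "%s", "hello world!"));
        return 0;
}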
- */ - if (WARN_ON_ONCE(hw_context_taken && - !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) - pmu->task_ctx_nr = perf_invalid_context; - - hw_context_taken = 1; - } - - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - ret = -ENOMEM; - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.pmu = pmu; - cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); - - __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); - cpuctx->heap = cpuctx->heap_default; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + __perf_init_event_pmu_context(&cpc->epc, pmu); + __perf_mux_hrtimer_init(cpc, cpu); } -got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -11528,8 +11833,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11551,9 +11856,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (parent_event) event->event_caps = parent_event->event_caps; - if (event->attr.sigtrap) - atomic_set(&event->event_limit, 1); - if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -11623,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } /* - * Disallow uncore-cgroup events, they don't make sense as the cgroup will - * be different on other CPUs in the uncore mask. + * Disallow uncore-task events. Similarly, disallow uncore-cgroup + * events (they don't make sense as the cgroup will be different + * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } @@ -11709,11 +12012,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } @@ -11975,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } -/* - * Variation on perf_event_ctx_lock_nested(), except we take two context - * mutexes. 
- */ -static struct perf_event_context * -__perf_event_ctx_lock_double(struct perf_event *group_leader, - struct perf_event_context *ctx) -{ - struct perf_event_context *gctx; - -again: - rcu_read_lock(); - gctx = READ_ONCE(group_leader->ctx); - if (!refcount_inc_not_zero(&gctx->refcount)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - - mutex_lock_double(&gctx->mutex, &ctx->mutex); - - if (group_leader->ctx != gctx) { - mutex_unlock(&ctx->mutex); - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - goto again; - } - - return gctx; -} - static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { @@ -12051,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *gctx; + struct perf_event_context *ctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -12183,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader) { - if (is_software_event(event) && - !in_software_context(group_leader)) { - /* - * If the event is a sw event, but the group_leader - * is on hw context. - * - * Allow the addition of software events to hw - * groups, this is safe because software events - * never fail to schedule. - */ - pmu = group_leader->ctx->pmu; - } else if (!is_software_event(event) && - is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + goto err_alloc; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!perf_check_permission(&attr, task)) + goto err_cred; } /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_alloc; + goto err_cred; + } + + mutex_lock(&ctx->mutex); + + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } } - /* - * Look up the group leader (we will attach this event to it): - */ if (group_leader) { err = -EINVAL; @@ -12227,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_context; + goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) - goto err_context; + goto err_locked; /* * Make sure we're both events for the same CPU; @@ -12239,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open, * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) - goto err_context; - - /* - * Make sure we're both on the same task, or both - * per-CPU events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + goto err_locked; /* - * Do not allow to attach to a group in a different task - * or CPU context. If we're moving SW events, we'll fix - * this up later, so allow that. - * - * Racy, not holding group_leader->ctx->mutex, see comment with - * perf_event_ctx_lock(). + * Make sure we're both on the same context; either task or cpu. */ - if (!move_group && group_leader->ctx != ctx) - goto err_context; + if (group_leader->ctx != ctx) + goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, - f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } - - if (task) { - err = down_read_interruptible(&task->signal->exec_update_lock); - if (err) - goto err_file; - - /* - * We must hold exec_update_lock across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perf_check_permission(&attr, task)) - goto err_cred; - } - - if (move_group) { - gctx = __perf_event_ctx_lock_double(group_leader, ctx); - - if (gctx->task == TASK_TOMBSTONE) { - err = -ESRCH; goto err_locked; - } - /* - * Check if we raced against another sys_perf_event_open() call - * moving the software group underneath us. - */ - if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If someone moved the group out from under us, check - * if this new event wound up on the same ctx, if so - * its the regular !move_group case, otherwise fail. + * If the event is a sw event, but the group_leader + * is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. 
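As the comment above says, a software event may join a hardware group because software events never fail to schedule; from user space that simply means passing the hardware leader's fd as group_fd. A sketch, with arbitrary counter choices:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdio.h>

static int open_event(struct perf_event_attr *attr, int group_fd)
{
        return syscall(SYS_perf_event_open, attr, 0 /* this thread */, -1,
                       group_fd, PERF_FLAG_FD_CLOEXEC);
}

int main(void)
{
        struct perf_event_attr hw = {
                .size = sizeof(hw),
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .disabled = 1,          /* enable the whole group at once below */
        };
        struct perf_event_attr sw = {
                .size = sizeof(sw),
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_PAGE_FAULTS,
        };
        long long count;
        int leader, sibling;

        leader = open_event(&hw, -1);           /* hardware group leader */
        sibling = open_event(&sw, leader);      /* software event in the hw group */
        if (leader < 0 || sibling < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
        /* ... workload ... */
        ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

        read(sibling, &count, sizeof(count));
        printf("page faults: %lld\n", count);

        close(sibling);
        close(leader);
        return 0;
}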
*/ - if (gctx != ctx) { - err = -EINVAL; - goto err_locked; - } else { - perf_event_ctx_unlock(group_leader, gctx); - move_group = 0; - goto not_move_group; + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; } - } - - /* - * Failure to create exclusive events returns -EBUSY. - */ - err = -EBUSY; - if (!exclusive_event_installable(group_leader, ctx)) - goto err_locked; - for_each_sibling_event(sibling, group_leader) { - if (!exclusive_event_installable(sibling, ctx)) + /* Don't allow group of multiple hw events from different pmus */ + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) goto err_locked; } - } else { - mutex_lock(&ctx->mutex); - - /* - * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, - * see the group_leader && !move_group test earlier. - */ - if (group_leader && group_leader->ctx != ctx) { - err = -EINVAL; - goto err_locked; - } } -not_move_group: - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); goto err_locked; } + event->pmu_ctx = pmu_ctx; - if (!perf_event_validate_size(event)) { - err = -E2BIG; - goto err_locked; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; } - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = - container_of(ctx, struct perf_cpu_context, ctx); - - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; - } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; - goto err_locked; + goto err_context; } /* @@ -12386,36 +12599,33 @@ not_move_group: */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_locked; + goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + event_file = NULL; + goto err_context; + } + /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ perf_remove_from_context(group_leader, 0); - put_ctx(gctx); + put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); - put_ctx(gctx); + put_pmu_ctx(sibling->pmu_ctx); } /* - * Wait for everybody to stop referencing the events through - * the old lists, before installing it on new lists. - */ - synchronize_rcu(); - - /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group @@ -12426,9 +12636,10 @@ not_move_group: * reachable through the group lists. 
*/ for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); - get_ctx(ctx); } /* @@ -12436,9 +12647,10 @@ not_move_group: * event. What we want here is event in the initial * startup state, ready to be add into new context. */ + group_leader->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); } /* @@ -12455,8 +12667,6 @@ not_move_group: perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -12478,25 +12688,17 @@ not_move_group: fd_install(event_fd, event_file); return event_fd; +err_context: + /* event->pmu_ctx freed by free_event() */ err_locked: - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); -err_file: - fput(event_file); -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); err_alloc: - /* - * If event_file is set, the fput() above will have called ->release() - * and that will take care of freeing the event. - */ - if (!event_file) - free_event(event); + free_event(event); err_task: if (task) put_task_struct(task); @@ -12522,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, perf_overflow_handler_t overflow_handler, void *context) { + struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; + struct pmu *pmu; int err; /* @@ -12542,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + pmu = event->pmu; + + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ - ctx = find_get_context(event->pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_free; + goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); @@ -12559,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); + goto err_unlock; + } + event->pmu_ctx = pmu_ctx; + if (!task) { /* * Check if the @cpu we're creating an event for is online. 
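perf_event_create_kernel_counter(), continued below, is the in-kernel twin of the syscall path: it allocates the event, resolves the context and now also the pmu_ctx, and installs it. A hedged sketch of how a hypothetical in-kernel user might drive it; error handling is trimmed, and the overflow callback signature is the one invoked via READ_ONCE(event->overflow_handler)(...) earlier in this file:

/* Sketch only: a hypothetical in-kernel user of the API shown here. */
static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data, struct pt_regs *regs)
{
        /* runs from the PMU interrupt; keep it short */
}

static struct perf_event *my_counter;

static int my_counter_start(int cpu)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .size           = sizeof(attr),
                .sample_period  = 1000000,
        };

        my_counter = perf_event_create_kernel_counter(&attr, cpu,
                                                      NULL /* per-CPU, not per-task */,
                                                      my_overflow, NULL);
        if (IS_ERR(my_counter))
                return PTR_ERR(my_counter);
        return 0;
}

static void my_counter_stop(void)
{
        perf_event_release_kernel(my_counter);
}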
@@ -12570,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; - goto err_unlock; + goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_unlock; + goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); @@ -12585,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_pmu_ctx: + put_pmu_ctx(pmu_ctx); err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); -err_free: +err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +static void __perf_pmu_remove(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, + struct perf_event_groups *groups, + struct list_head *events) { - struct perf_event_context *src_ctx; - struct perf_event_context *dst_ctx; - struct perf_event *event, *tmp; - LIST_HEAD(events); - - src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; - dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + struct perf_event *event, *sibling; - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ - mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &src_ctx->event_list, - event_entry) { + perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, src_cpu); - put_ctx(src_ctx); - list_add(&event->migrate_entry, &events); + unaccount_event_cpu(event, cpu); + put_pmu_ctx(event->pmu_ctx); + list_add(&event->migrate_entry, events); + + for_each_sibling_event(sibling, event) { + perf_remove_from_context(sibling, 0); + unaccount_event_cpu(sibling, cpu); + put_pmu_ctx(sibling->pmu_ctx); + list_add(&sibling->migrate_entry, events); + } } +} - /* - * Wait for the events to quiesce before re-instating them. - */ - synchronize_rcu(); +static void __perf_pmu_install_event(struct pmu *pmu, + struct perf_event_context *ctx, + int cpu, struct perf_event *event) +{ + struct perf_event_pmu_context *epc; + + event->cpu = cpu; + epc = find_get_pmu_context(pmu, ctx, event); + event->pmu_ctx = epc; + + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, cpu); + perf_install_in_context(ctx, event, cpu); +} + +static void __perf_pmu_install(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, struct list_head *events) +{ + struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. @@ -12632,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) * leader will enable its siblings, even if those are still on the old * context. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. 
*/ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } +} + +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx, *dst_ctx; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); + + /* + * Wait for the events to quiesce before re-instating them. + */ + synchronize_rcu(); + + __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); + mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } @@ -12735,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); - child_ctx = perf_pin_task_context(child, ctxn); + child_ctx = perf_pin_task_context(child); if (!child_ctx) return; @@ -12764,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ @@ -12805,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; - int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, @@ -12821,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child) } mutex_unlock(&child->perf_event_mutex); - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); + perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task @@ -12865,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; - int ctxn; - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = rcu_access_pointer(task->perf_event_ctxp); + if (!ctx) + return; - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). 
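perf_pmu_migrate_context() above now walks the pinned and flexible group trees of the source CPU context and re-installs siblings before leaders on the destination. Uncore-style drivers call it when the CPU carrying their events goes offline; a hedged sketch of the usual teardown hook (driver names hypothetical), typically registered through cpuhp_setup_state():

/* Sketch: move this PMU's events off a dying CPU (names hypothetical). */
static struct pmu my_uncore_pmu;
static unsigned int my_uncore_cpu;

static int my_uncore_cpu_offline(unsigned int cpu)
{
        unsigned int target;

        if (cpu != my_uncore_cpu)
                return 0;

        /* Pick any other online CPU to carry the events from now on. */
        target = cpumask_any_but(cpu_online_mask, cpu);
        if (target >= nr_cpu_ids)
                return 0;       /* last CPU going down, nothing to migrate to */

        perf_pmu_migrate_context(&my_uncore_pmu, cpu, target);
        my_uncore_cpu = target;
        return 0;
}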
- */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - mutex_unlock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) + perf_free_event(event, ctx); - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ - } + mutex_unlock(&ctx->mutex); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. 
+ */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); + WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) @@ -12964,6 +13207,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; @@ -12984,17 +13228,12 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - - if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && - !child_ctx->task_ctx_data) { - struct pmu *pmu = child_event->pmu; - - child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); - if (!child_ctx->task_ctx_data) { - free_event(child_event); - return ERR_PTR(-ENOMEM); - } + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + if (IS_ERR(pmu_ctx)) { + free_event(child_event); + return NULL; } + child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) @@ -13117,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, + struct task_struct *child, u64 clone_flags, int *inherited_all) { - int ret; struct perf_event_context *child_ctx; + int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || @@ -13131,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -13139,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + child->perf_event_ctxp = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; @@ -13158,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn, - u64 clone_flags) +static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -13169,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, unsigned long flags; int ret = 0; - if (likely(!parent->perf_event_ctxp[ctxn])) + if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. 
*/ - parent_ctx = perf_pin_task_context(parent, ctxn); + parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; @@ -13199,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13216,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13225,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* @@ -13261,18 +13495,16 @@ out_unlock: */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { - int ctxn, ret; + int ret; - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn, clone_flags); - if (ret) { - perf_event_free_task(child); - return ret; - } + ret = perf_event_init_context(child, clone_flags); + if (ret) { + perf_event_free_task(child); + return ret; } return 0; @@ -13281,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; + struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); @@ -13288,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); -#ifdef CONFIG_CGROUP_PERF - INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); -#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); + + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } } @@ -13318,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); @@ -13333,18 +13570,16 @@ static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct 
perf_event_context *ctx; - struct pmu *pmu; + // XXX simplify cpuctx->online mutex_lock(&pmus_lock); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - cpuctx->online = 0; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } @@ -13358,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - cpuctx->online = 1; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; @@ -13508,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; - rcu_read_lock(); - perf_cgroup_switch(task); - rcu_read_unlock(); + + preempt_disable(); + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_switch(task); + preempt_enable(); + return 0; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,61 +17,276 @@ * This file contains the arch-independent routines. */ +#include <linux/hw_breakpoint.h> + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/irqflags.h> -#include <linux/kallsyms.h> -#include <linux/notifier.h> -#include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/percpu-rwsem.h> #include <linux/percpu.h> +#include <linux/rhashtable.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/list.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/bug.h> -#include <linux/hw_breakpoint.h> /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. + */ +struct bp_slots_histogram { +#ifdef hw_breakpoint_slots + atomic_t count[hw_breakpoint_slots(0)]; +#else + atomic_t *count; +#endif +}; + +/* + * Per-CPU constraints data. */ struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ - unsigned int *tsk_pinned; - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. 
*/ + struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. */ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; -static int constraints_initialized; +static bool constraints_initialized __ro_after_init; -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; -__weak int hw_breakpoint_weight(struct perf_event *bp) + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. 
+ */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} + +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. + */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) + goto err; + } + } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + for (i = 0; i < TYPE_MAX; i++) { + bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } + + return -ENOMEM; +} +#endif + +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. 
*/ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { @@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). 
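+ * For a task target whose existing breakpoints are all CPU-independent, this
+ * reduces to the CPU-independent task maximum plus the global CPU-pinned
+ * maximum, without walking the CPU mask.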
*/ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. + */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - tsk_pinned[old_idx]--; - if (new_idx >= 0) - tsk_pinned[new_idx]++; + return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) +static int +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + int cpu, next_tsk_pinned; if (!enable) weight = -weight; - /* Pinned counter cpu profiling */ if (!bp->hw.target) { - get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + + lockdep_assert_held_write(&bp_cpuinfo_sem); + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; + return 0; + } + + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. 
On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ + + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. + */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + if (ret) + return ret; + } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. + */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); + + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } } - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + /* + * Readers want a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); - else - list_del(&bp->hw.bp_list); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. 
* * == Non-pinned counter == (Considered as pinned for now) * @@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + /* Check if this new breakpoint can be satisfied across all CPUs. */ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. 
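+ *
+ * Walks the per-CPU and global slot histograms without taking
+ * bp_cpuinfo_sem, so the result is only meaningful if no breakpoints are
+ * being installed or removed concurrently.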
+ */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + if (atomic_read(&info->tsk_pinned.count[slot])) + return true; + } + } + } + + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. + */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ @@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); + int ret; - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + return ret; - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; - } - } + ret = init_breakpoint_slots(); + if (ret) + return ret; - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return -ENOMEM; } - - diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..c57610f52bb4 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. 
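+ *
+ * The tests register breakpoints on combinations of CPU and task targets
+ * until the expected -ENOSPC is hit, exercising the transitions between
+ * CPU-bound and CPU-independent task breakpoint accounting.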
+ */ + +#include <kunit/test.h> +#include <linux/cpumask.h> +#include <linux/hw_breakpoint.h> +#include <linux/kthread.h> +#include <linux/perf_event.h> +#include <asm/hw_breakpoint.h> + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. + */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. 
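+ * Filling every slot on one CPU must not prevent the next online CPU from
+ * being filled.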
*/ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. + */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. 
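+ * Both tasks pin their slots on get_test_cpu(0) only, leaving other CPUs'
+ * slots free.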
*/ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. */ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + if (num_online_cpus() < 2) + kunit_skip(test, "not enough cpus"); + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + kunit_skip(test, "hw breakpoint already in use"); + + return 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } + + /* Verify that internal state agrees that no breakpoints are in use. 
*/ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 726132039c38..273a0fe7910a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); + irq_work_queue(&handle->event->pending_irq); } /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158..d9e357b7e17c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ -#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } @@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct 
* find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -552,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } @@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e..15dc2ec80c46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -60,17 +60,65 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> +#include <linux/sysfs.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/mmu_context.h> +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. 
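+ * The limit is exposed as /proc/sys/kernel/oops_limit; a value of 0 disables
+ * the panic.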
+ */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -183,6 +231,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; @@ -374,10 +426,10 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -466,6 +518,7 @@ assign_new_owner: goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } @@ -733,14 +786,33 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif +static void synchronize_group_exit(struct task_struct *tsk, long code) +{ + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } + spin_unlock_irq(&sighand->siglock); +} + void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; + synchronize_group_exit(tsk, code); + WARN_ON(tsk->plug); kcov_task_exit(tsk); + kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); @@ -859,6 +931,7 @@ void __noreturn make_task_dead(int signr) * Then do everything else. */ struct task_struct *tsk = current; + unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); @@ -873,6 +946,20 @@ void __noreturn make_task_dead(int signr) } /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). 
+ */ + limit = READ_ONCE(oops_limit); + if (atomic_inc_return(&oops_count) >= limit && limit) + panic("Oopsed too often (kernel.oops_limit is %d)", limit); + + /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ @@ -905,7 +992,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; - else if (!thread_group_empty(current)) { + else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 60dc825ecc2b..a7ccd2930c5f 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, buffer, count)) { - ret = -EFAULT; - goto out_free; - } - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sym = strstrip(buf); mutex_lock(&fei_lock); @@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } ret = register_kprobe(&attr->kp); - if (!ret) - fei_debugfs_add_attr(attr); - if (ret < 0) - fei_attr_remove(attr); - else { - list_add_tail(&attr->list, &fei_attr_list); - ret = count; + if (ret) { + fei_attr_free(attr); + goto out; } + fei_debugfs_add_attr(attr); + list_add_tail(&attr->list, &fei_attr_list); + ret = count; out: mutex_unlock(&fei_lock); -out_free: kfree(buf); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 8a9e92068b15..9f7fe3541897 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,13 +37,13 @@ #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> +#include <linux/kmsan.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -75,7 +75,6 @@ #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> -#include <linux/random.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@ -97,7 +96,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> -#include <linux/sched/mm.h> +#include <linux/stackprotector.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); - new->vm_next = new->vm_prev = NULL; dup_anon_vma_name(orig, new); } return new; @@ -537,6 +535,9 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { +#ifdef CONFIG_SECCOMP + WARN_ON_ONCE(tsk->seccomp.filter); +#endif release_user_cpus_ptr(tsk); scs_release(tsk); @@ -580,11 +581,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; + struct vm_area_struct *mpnt, *tmp; int retval; - unsigned long charge; + unsigned long charge = 0; LIST_HEAD(uf); + MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, 0, 0); 
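+
+	/*
+	 * old_mas walks the parent's maple tree; mas writes each copied VMA
+	 * into the child's tree, with nodes preallocated up front via
+	 * mas_expected_entries() against oldmm->map_count.
+	 */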
uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -606,16 +608,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + retval = mas_expected_entries(&mas, oldmm->map_count); + if (retval) + goto out; + + mas_for_each(&old_mas, mpnt, ULONG_MAX) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -629,7 +631,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto out; + goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -675,24 +677,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); + hugetlb_dup_vma_private(tmp); - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; + /* Link the vma into the MT */ + mas.index = tmp->vm_start; + mas.last = tmp->vm_end - 1; + mas_store(&mas, tmp); + if (mas_is_err(&mas)) + goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,10 +697,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_ops->open(tmp); if (retval) - goto out; + goto loop_out; } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); +loop_out: + mas_destroy(&mas); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -714,6 +711,9 @@ out: fail_uprobe_end: uprobe_end_dup_mmap(); return retval; + +fail_nomem_mas_store: + unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: @@ -721,7 +721,7 @@ fail_nomem_policy: fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); - goto out; + goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) @@ -756,8 +756,13 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); + long x = percpu_counter_sum(&mm->rss_stat[i]); + if (likely(!x)) + continue; + + /* Making sure this is not due to race with CPU offlining. 
*/ + x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); @@ -782,6 +787,8 @@ static void check_mm(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { + int i; + BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); @@ -791,6 +798,9 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + + for (i = 0; i < NR_MM_COUNTERS; i++) + percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -925,13 +935,13 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) + for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", @@ -1026,6 +1036,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->worker_private = NULL; kcov_task_init(tsk); + kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION @@ -1109,9 +1120,10 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - mm->mmap = NULL; - mm->mm_rb = RB_ROOT; - mm->vmacache_seqnum = 0; + int i; + + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); + mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); @@ -1151,9 +1163,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + for (i = 0; i < NR_MM_COUNTERS; i++) + if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) + goto fail_pcpu; + mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; +fail_pcpu: + while (i > 0) + percpu_counter_destroy(&mm->rss_stat[--i]); fail_nocontext: mm_free_pgd(mm); fail_nopgd: @@ -1194,6 +1214,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -1285,13 +1306,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /* Forbid mm->exe_file change if old file still mapped. 
*/ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, - &old_exe_file->f_path)) + &old_exe_file->f_path)) { ret = -EBUSY; + break; + } } mmap_read_unlock(mm); fput(old_exe_file); @@ -1421,13 +1445,12 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { + unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; int killed; - freezer_do_not_count(); cgroup_enter_frozen(); - killed = wait_for_completion_killable(vfork); + killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); - freezer_count(); if (killed) { task_lock(child); @@ -1567,9 +1590,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; - /* initialize the new vmacache entries */ - vmacache_flush(tsk); - if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; @@ -1693,6 +1713,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; sig->nr_threads = 1; + sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); @@ -2044,18 +2065,6 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - /* - * If the new process will be in a different time namespace - * do not allow it to share VM or a thread group with the forking task. - * - * On vfork, the child process enters the target time namespace only - * after exec. - */ - if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) { - if (nsp->time_ns != nsp->time_ns_for_children) - return ERR_PTR(-EINVAL); - } - if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially @@ -2119,7 +2128,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_free; retval = -EAGAIN; - if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { + if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; @@ -2410,12 +2419,6 @@ static __latent_entropy struct task_struct *copy_process( spin_lock(¤t->sighand->siglock); - /* - * Copy seccomp details explicitly here, in case they were changed - * before holding sighand lock. - */ - copy_seccomp(p); - rv_task_fork(p); rseq_fork(p, clone_flags); @@ -2432,6 +2435,14 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* No more failure paths after this point. */ + + /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2463,6 +2474,7 @@ static __latent_entropy struct task_struct *copy_process( __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; + current->signal->quick_threads++; atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); @@ -2595,11 +2607,6 @@ struct task_struct * __init fork_idle(int cpu) return task; } -struct mm_struct *copy_init_mm(void) -{ - return dup_mm(NULL, &init_mm); -} - /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. 
It returns a created task, or an error pointer. @@ -2695,6 +2702,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock the task to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ @@ -3011,10 +3025,27 @@ static void sighand_ctor(void *data) init_waitqueue_head(&sighand->signalfd_wqh); } -void __init proc_caches_init(void) +void __init mm_cache_init(void) { unsigned int mm_size; + /* + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). + */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + + mm_cachep = kmem_cache_create_usercopy("mm_struct", + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), + NULL); +} + +void __init proc_caches_init(void) +{ sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3032,19 +3063,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - /* - * The mm_cpumask is located at the end of mm_struct, and is - * dynamically sized based on the maximum CPU number this system - * can have, taking hotplug into account (nr_cpu_ids). - */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); - - mm_cachep = kmem_cache_create_usercopy("mm_struct", - mm_size, ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, - offsetof(struct mm_struct, saved_auxv), - sizeof_field(struct mm_struct, saved_auxv), - NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); diff --git a/kernel/freezer.c b/kernel/freezer.c index 45ab36ffd0e7..4fad0e6fca64 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -13,10 +13,11 @@ #include <linux/kthread.h> /* total number of freezing conditions in effect */ -atomic_t system_freezing_cnt = ATOMIC_INIT(0); -EXPORT_SYMBOL(system_freezing_cnt); +DEFINE_STATIC_KEY_FALSE(freezer_active); +EXPORT_SYMBOL(freezer_active); -/* indicate whether PM freezing is in effect, protected by +/* + * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; @@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock); * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * - * This function is called by freezing() if system_freezing_cnt isn't zero + * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. @@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p) } EXPORT_SYMBOL(freezing_slow_path); +bool frozen(struct task_struct *p) +{ + return READ_ONCE(p->__state) & TASK_FROZEN; +} + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { - /* Hmm, should we be allowed to suspend when there are realtime - processes around? 
*/ + unsigned int state = get_current_state(); bool was_frozen = false; - unsigned int save = get_current_state(); pr_debug("%s entered refrigerator\n", current->comm); + WARN_ON_ONCE(state && !(state & TASK_NORMAL)); + for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + bool freeze; + + set_current_state(TASK_FROZEN); spin_lock_irq(&freezer_lock); - current->flags |= PF_FROZEN; - if (!freezing(current) || - (check_kthr_stop && kthread_should_stop())) - current->flags &= ~PF_FROZEN; + freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); - if (!(current->flags & PF_FROZEN)) + if (!freeze) break; + was_frozen = true; schedule(); } + __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); - /* - * Restore saved task state before returning. The mb'd version - * needs to be used; otherwise, it might silently break - * synchronization which depends on ordered task state change. - */ - set_current_state(save); - return was_frozen; } EXPORT_SYMBOL(__refrigerator); @@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p) } } +static int __set_task_frozen(struct task_struct *p, void *arg) +{ + unsigned int state = READ_ONCE(p->__state); + + if (p->on_rq) + return 0; + + if (p != current && task_curr(p)) + return 0; + + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) + return 0; + + /* + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they + * can suffer spurious wakeups. + */ + if (state & TASK_FREEZABLE) + WARN_ON_ONCE(!(state & TASK_NORMAL)); + +#ifdef CONFIG_LOCKDEP + /* + * It's dangerous to freeze with locks held; there be dragons there. + */ + if (!(state & __TASK_FREEZABLE_UNSAFE)) + WARN_ON_ONCE(debug_locks && p->lockdep_depth); +#endif + + WRITE_ONCE(p->__state, TASK_FROZEN); + return TASK_FROZEN; +} + +static bool __freeze_task(struct task_struct *p) +{ + /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ + return task_call_func(p, __set_task_frozen, NULL); +} + /** * freeze_task - send a freeze request to given task * @p: task to send the request to @@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; - /* - * This check can race with freezer_do_not_count, but worst case that - * will result in an extra wakeup being sent to the task. It does not - * race with freezer_count(), the barriers in freezer_count() and - * freezer_should_skip() ensure that either freezer_count() sees - * freezing == true in try_to_freeze() and freezes, or - * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task - * normally. - */ - if (freezer_should_skip(p)) - return false; - spin_lock_irqsave(&freezer_lock, flags); - if (!freezing(p) || frozen(p)) { + if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } @@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p) if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else - wake_up_state(p, TASK_INTERRUPTIBLE); + wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } +/* + * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical + * state in p->jobctl. If either of them got a wakeup that was missed because + * TASK_FROZEN, then their canonical state reflects that and the below will + * refuse to restore the special state and instead issue the wakeup. 
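The freezer rework above keeps the frozen state in p->__state (TASK_FROZEN) rather than in PF_FROZEN plus the freezer_count()/freezer_should_skip() bookkeeping, while the consumer-facing entry points stay the same. A minimal sketch of a freezable kernel thread under that assumption; the thread and its workload are hypothetical.

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/delay.h>

/* Hypothetical worker: does periodic work but cooperates with the freezer. */
static int demo_freezable_thread(void *data)
{
	set_freezable();	/* kthreads start with PF_NOFREEZE; clear it */

	while (!kthread_should_stop()) {
		/*
		 * try_to_freeze() parks the thread in __refrigerator() while
		 * freezing(current) is true; after the patch it waits in
		 * TASK_FROZEN instead of TASK_UNINTERRUPTIBLE with PF_FROZEN.
		 */
		try_to_freeze();

		/* ... one unit of work ... */
		msleep_interruptible(1000);
	}
	return 0;
}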
+ */ +static int __set_task_special(struct task_struct *p, void *arg) +{ + unsigned int state = 0; + + if (p->jobctl & JOBCTL_TRACED) + state = TASK_TRACED; + + else if (p->jobctl & JOBCTL_STOPPED) + state = TASK_STOPPED; + + if (state) + WRITE_ONCE(p->__state, state); + + return state; +} + void __thaw_task(struct task_struct *p) { - unsigned long flags; + unsigned long flags, flags2; spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) - wake_up_process(p); + if (WARN_ON_ONCE(freezing(p))) + goto unlock; + + if (lock_task_sighand(p, &flags2)) { + /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ + bool ret = task_call_func(p, __set_task_special, NULL); + unlock_task_sighand(p, &flags2); + if (ret) + goto unlock; + } + + wake_up_state(p, TASK_FROZEN); +unlock: spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index b22ef1efe751..514e4582b863 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -638,6 +638,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; + pid_t owner; int err; /* Futex address must be 32bit aligned */ @@ -659,6 +660,10 @@ retry: * 2. A woken up waiter is killed before it can acquire the * futex in user space. * + * In the second case, the wake up notification could be generated + * by the unlock path in user space after setting the futex value + * to zero or by the kernel after setting the OWNER_DIED bit below. + * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. @@ -667,24 +672,27 @@ retry: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true - * 2) User space futex value == 0 + * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and - * trying to set the OWNER_DIED bit. The user space futex value is - * uncontended and the rest of the user space mutex state is - * consistent, so a woken waiter will just take over the - * uncontended futex. Setting the OWNER_DIED bit would create - * inconsistent state and malfunction of the user space owner died - * handling. + * trying to set the OWNER_DIED bit. If the futex value is zero, + * the rest of the user space mutex state is consistent, so a woken + * waiter will just take over the uncontended futex. Setting the + * OWNER_DIED bit would create inconsistent state and malfunction + * of the user space owner died handling. Otherwise, the OWNER_DIED + * bit is already set, and the woken waiter is expected to deal with + * this. */ - if (pending_op && !pi && !uval) { + owner = uval & FUTEX_TID_MASK; + + if (pending_op && !pi && !owner) { futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); return 0; } - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + if (owner != task_pid_vnr(curr)) return 0; /* diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 4ce0923f1ce3..ba01b9408203 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -334,7 +334,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. 
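handle_futex_death() above now masks the futex word with FUTEX_TID_MASK before the pending-op check, so a word that still carries FUTEX_OWNER_DIED or FUTEX_WAITERS but no owner TID counts as ownerless. A small userspace-style illustration of that check using the uapi bit definitions; the helper name is made up for the example.

#include <stdint.h>
#include <stdbool.h>
#include <linux/futex.h>	/* FUTEX_TID_MASK, FUTEX_OWNER_DIED, FUTEX_WAITERS */

/* Does this robust-futex word have no live owner, as the patched check sees it? */
static bool futex_word_is_ownerless(uint32_t uval)
{
	/* Before the change only uval == 0 qualified; now only the TID part matters. */
	return (uval & FUTEX_TID_MASK) == 0;
}

/*
 * Example values:
 *   0x00000000                       -> ownerless (old and new check agree)
 *   FUTEX_OWNER_DIED | FUTEX_WAITERS -> ownerless only with the new check
 *   FUTEX_WAITERS | 1234             -> owned by TID 1234, not ownerless
 */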
*/ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ @@ -352,7 +352,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -430,7 +430,7 @@ retry: return ret; } - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; @@ -504,7 +504,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, return; } - freezable_schedule(); + schedule(); } /** diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index cbb0bed958ab..7670a811a565 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) for (i = 0; i < sfn_ptr->num_counters; i++) dfn_ptr->counters[i] += sfn_ptr->counters[i]; + + sfn_ptr = list_next_entry(sfn_ptr, head); } } diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 460c12b7dfea..74a4ef1da9ad 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -30,6 +30,13 @@ #define GCOV_TAG_FUNCTION_LENGTH 3 +/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */ +#if (__GNUC__ >= 12) +#define GCOV_UNIT_SIZE 4 +#else +#define GCOV_UNIT_SIZE 1 +#endif + static struct gcov_info *gcov_info_head; /** @@ -75,6 +82,7 @@ struct gcov_fn_info { * @version: gcov version magic indicating the gcc version used for compilation * @next: list head for a singly-linked list * @stamp: uniquifying time stamp + * @checksum: unique object checksum * @filename: name of the associated gcov data file * @merge: merge functions (null for unused counter type) * @n_functions: number of instrumented functions @@ -87,6 +95,10 @@ struct gcov_info { unsigned int version; struct gcov_info *next; unsigned int stamp; + /* Since GCC 12.1 a checksum field is added. */ +#if (__GNUC__ >= 12) + unsigned int checksum; +#endif const char *filename; void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); unsigned int n_functions; @@ -383,12 +395,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) pos += store_gcov_u32(buffer, pos, info->version); pos += store_gcov_u32(buffer, pos, info->stamp); +#if (__GNUC__ >= 12) + /* Use zero as checksum of the compilation unit. */ + pos += store_gcov_u32(buffer, pos, 0); +#endif + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { fi_ptr = info->functions[fi_idx]; /* Function record. */ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); - pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); + pos += store_gcov_u32(buffer, pos, + GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE); pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); @@ -402,7 +420,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) /* Counter record. 
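The gcc_4_7.c hunks above introduce GCOV_UNIT_SIZE because GCC 12.1 records .gcda record lengths in bytes rather than 32-bit words. A tiny standalone illustration of the lengths that end up in the stream; apart from the two-words-per-counter layout and the unit switch taken from the patch, the names are made up.

#include <stdio.h>

#define GCOV_TAG_FUNCTION_LENGTH	3	/* same constant as in gcc_4_7.c */

/* Length field of a counter record as emitted by convert_to_gcda(). */
static unsigned int counter_record_len(unsigned int num_counters, int gcc_major)
{
	unsigned int unit = (gcc_major >= 12) ? 4 : 1;	/* GCOV_UNIT_SIZE */

	/* every 64-bit counter is stored as two u32 items */
	return num_counters * 2 * unit;
}

int main(void)
{
	/* 5 counters: 10 (words) against GCC 11, 40 (bytes) against GCC 12 */
	printf("function record: %u / %u, counter record: %u / %u\n",
	       GCOV_TAG_FUNCTION_LENGTH * 1, GCOV_TAG_FUNCTION_LENGTH * 4,
	       counter_record_len(5, 11), counter_record_len(5, 12));
	return 0;
}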
*/ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FOR_COUNTER(ct_idx)); - pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + pos += store_gcov_u32(buffer, pos, + ci_ptr->num * 2 * GCOV_UNIT_SIZE); for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { pos += store_gcov_u64(buffer, pos, diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0c78e64f747d..473036b43c83 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -31,8 +31,8 @@ if [ "$building_out_of_srctree" ]; then fi all_dirs="$all_dirs $dir_list" -# include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. +# include/generated/utsversion.h is ignored because it is generated after this +# script is executed. (utsversion.h is unneeded for kheaders) # # When Kconfig regenerates include/generated/autoconf.h, its timestamp is # updated, but the contents might be still the same. When any CONFIG option is @@ -42,7 +42,7 @@ all_dirs="$all_dirs $dir_list" # # Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | - grep -v "include/generated/compile.h" | + grep -v "include/generated/utsversion.h" | grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2354f73ded..c71889f3f3fc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. */ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; + if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) + return; /* * When a freshly created task is scheduled once, changes its state to @@ -191,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { + unsigned int state; + if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { @@ -198,8 +200,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + */ + state = READ_ONCE(t->__state); + if ((state & TASK_UNINTERRUPTIBLE) && + !(state & TASK_WAKEKILL) && + !(state & TASK_NOLOAD)) check_hung_task(t, timeout); } unlock: diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index db3d174c53d4..b64c44ae4c25 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -86,15 +86,10 @@ config GENERIC_IRQ_IPI depends on SMP select IRQ_DOMAIN_HIERARCHY -# Generic MSI interrupt support -config GENERIC_MSI_IRQ - bool - # Generic MSI hierarchical interrupt domain support -config GENERIC_MSI_IRQ_DOMAIN +config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY - select GENERIC_MSI_IRQ config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8ac37e8e738a..49e7bc871fec 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1561,10 +1561,10 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } -static struct device *irq_get_parent_device(struct irq_data *data) +static struct device *irq_get_pm_device(struct irq_data *data) { if (data->domain) - return data->domain->dev; + 
return data->domain->pm_dev; return NULL; } @@ -1578,7 +1578,7 @@ static struct device *irq_get_parent_device(struct irq_data *data) */ int irq_chip_pm_get(struct irq_data *data) { - struct device *dev = irq_get_parent_device(data); + struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) @@ -1597,7 +1597,7 @@ int irq_chip_pm_get(struct irq_data *data) */ int irq_chip_pm_put(struct irq_data *data) { - struct device *dev = irq_get_parent_device(data); + struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f09c60393e55..5fdc0b557579 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -52,6 +52,7 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs + * IRQS_SYSFS - descriptor has been added to sysfs */ enum { IRQS_AUTODETECT = 0x00000001, @@ -64,6 +65,7 @@ enum { IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, IRQS_NMI = 0x00002000, + IRQS_SYSFS = 0x00004000, }; #include "debug.h" diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5db0230aa6b5..fd0996274401 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing - * crucial. + * crucial and failures in the late irq_sysfs_init() + * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); + else + desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* - * If irq_sysfs_init() has not yet been invoked (early boot), then - * irq_kobj_base is NULL and the descriptor was never added. - * kobject_del() complains about a object with no parent, so make - * it conditional. + * Only invoke kobject_del() when kobject_add() was successfully + * invoked for the descriptor. This covers both early boot, where + * sysfs is not initialized yet, and the case of a failed + * kobject_add() invocation. */ - if (irq_kobj_base) + if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } @@ -705,6 +708,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); + /** + * generic_handle_irq_safe - Invoke the handler for a HW irq belonging + * to a domain from any context. + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process + * context). If the interrupt is marked as 'enforce IRQ-context only' then + * the function must be invoked from hard interrupt context. + */ +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); + /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. 
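generic_handle_domain_irq_safe() above wraps handle_irq_desc() in local_irq_save()/restore() so demultiplexing drivers can inject child interrupts from process context as well, for example from a threaded handler behind a slow bus. A minimal sketch under that assumption; the demo_* names and the pending-register callback are placeholders, not real driver code.

#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

struct demo_chip {
	struct irq_domain *domain;	/* child irqdomain of the expander */
	unsigned long (*read_pending)(struct demo_chip *dc);	/* hypothetical register read */
};

/* Threaded parent handler: runs in process context behind a sleeping bus. */
static irqreturn_t demo_parent_thread_fn(int irq, void *data)
{
	struct demo_chip *dc = data;
	unsigned long pending = dc->read_pending(dc);
	unsigned int hwirq;

	for_each_set_bit(hwirq, &pending, BITS_PER_LONG) {
		/* Callable from any context: local irqs are disabled around the handler. */
		generic_handle_domain_irq_safe(dc->domain, hwirq);
	}

	return pending ? IRQ_HANDLED : IRQ_NONE;
}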
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 40fe7806cc8c..5b7cf28df290 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -321,7 +321,7 @@ static int irq_try_set_affinity(struct irq_data *data, } static bool irq_set_affinity_deactivated(struct irq_data *data, - const struct cpumask *mask, bool force) + const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); @@ -354,7 +354,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_set_affinity_deactivated(data, mask, force)) + if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index a9ee535293eb..955267bbc2be 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -19,8 +19,31 @@ #include "internals.h" +/** + * struct msi_ctrl - MSI internal management control structure + * @domid: ID of the domain on which management operations should be done + * @first: First (hardware) slot index to operate on + * @last: Last (hardware) slot index to operate on + * @nirqs: The number of Linux interrupts to allocate. Can be larger + * than the range due to PCI/multi-MSI. + */ +struct msi_ctrl { + unsigned int domid; + unsigned int first; + unsigned int last; + unsigned int nirqs; +}; + +/* Invalid Xarray index which is outside of any searchable range */ +#define MSI_XA_MAX_INDEX (ULONG_MAX - 1) +/* The maximum domain size */ +#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1) + +static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl); +static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid); static inline int msi_sysfs_create_group(struct device *dev); + /** * msi_alloc_desc - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated @@ -33,7 +56,7 @@ static inline int msi_sysfs_create_group(struct device *dev); * Return: pointer to allocated &msi_desc on success or %NULL on failure */ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity) + const struct irq_affinity_desc *affinity) { struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); @@ -58,25 +81,56 @@ static void msi_free_desc(struct msi_desc *desc) kfree(desc); } -static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index) +static int msi_insert_desc(struct device *dev, struct msi_desc *desc, + unsigned int domid, unsigned int index) { + struct msi_device_data *md = dev->msi.data; + struct xarray *xa = &md->__domains[domid].store; + unsigned int hwsize; int ret; - desc->msi_index = index; - ret = xa_insert(&md->__store, index, desc, GFP_KERNEL); - if (ret) - msi_free_desc(desc); + hwsize = msi_domain_get_hwsize(dev, domid); + + if (index == MSI_ANY_INDEX) { + struct xa_limit limit = { .min = 0, .max = hwsize - 1 }; + unsigned int index; + + /* Let the xarray allocate a free index within the limit */ + ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL); + if (ret) + goto fail; + + desc->msi_index = index; + return 0; + } else { + if (index >= hwsize) { + ret = -ERANGE; + goto fail; + } + + desc->msi_index = index; + ret = xa_insert(xa, index, desc, GFP_KERNEL); + if (ret) + goto fail; + return 0; + } +fail: + msi_free_desc(desc); return ret; } /** - * msi_add_msi_desc - Allocate and initialize a MSI descriptor + * msi_domain_insert_msi_desc - Allocate and 
initialize a MSI descriptor and + * insert it at @init_desc->msi_index + * * @dev: Pointer to the device for which the descriptor is allocated + * @domid: The id of the interrupt domain to which the desriptor is added * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor * * Return: 0 on success or an appropriate failure code. */ -int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) +int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, + struct msi_desc *init_desc) { struct msi_desc *desc; @@ -88,40 +142,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) /* Copy type specific data to the new descriptor. */ desc->pci = init_desc->pci; - return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index); -} -/** - * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors - * @dev: Pointer to the device for which the descriptors are allocated - * @index: Index for the first MSI descriptor - * @ndesc: Number of descriptors to allocate - * - * Return: 0 on success or an appropriate failure code. - */ -static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) -{ - unsigned int idx, last = index + ndesc - 1; - struct msi_desc *desc; - int ret; - - lockdep_assert_held(&dev->msi.data->mutex); - - for (idx = index; idx <= last; idx++) { - desc = msi_alloc_desc(dev, 1, NULL); - if (!desc) - goto fail_mem; - ret = msi_insert_desc(dev->msi.data, desc, idx); - if (ret) - goto fail; - } - return 0; - -fail_mem: - ret = -ENOMEM; -fail: - msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last); - return ret; + return msi_insert_desc(dev, desc, domid, init_desc->msi_index); } static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) @@ -138,28 +160,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) return false; } +static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl) +{ + unsigned int hwsize; + + if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS || + (dev->msi.domain && + !dev->msi.data->__domains[ctrl->domid].domain))) + return false; + + hwsize = msi_domain_get_hwsize(dev, ctrl->domid); + if (WARN_ON_ONCE(ctrl->first > ctrl->last || + ctrl->first >= hwsize || + ctrl->last >= hwsize)) + return false; + return true; +} + +static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl) +{ + struct msi_desc *desc; + struct xarray *xa; + unsigned long idx; + + lockdep_assert_held(&dev->msi.data->mutex); + + if (!msi_ctrl_valid(dev, ctrl)) + return; + + xa = &dev->msi.data->__domains[ctrl->domid].store; + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + xa_erase(xa, idx); + + /* Leak the descriptor when it is still referenced */ + if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED))) + continue; + msi_free_desc(desc); + } +} + /** - * msi_free_msi_descs_range - Free MSI descriptors of a device - * @dev: Device to free the descriptors - * @filter: Descriptor state filter - * @first_index: Index to start freeing from - * @last_index: Last index to be freed + * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain + * @dev: Device for which to free the descriptors + * @domid: Id of the domain to operate on + * @first: Index to start freeing from (inclusive) + * @last: Last index to be freed (inclusive) */ -void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, - unsigned int first_index, unsigned int 
last_index) +void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + }; + + msi_domain_free_descs(dev, &ctrl); +} + +/** + * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors + * @dev: Pointer to the device for which the descriptors are allocated + * @ctrl: Allocation control struct + * + * Return: 0 on success or an appropriate failure code. + */ +static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl) { - struct xarray *xa = &dev->msi.data->__store; struct msi_desc *desc; - unsigned long idx; + unsigned int idx; + int ret; lockdep_assert_held(&dev->msi.data->mutex); - xa_for_each_range(xa, idx, desc, first_index, last_index) { - if (msi_desc_match(desc, filter)) { - xa_erase(xa, idx); - msi_free_desc(desc); - } + if (!msi_ctrl_valid(dev, ctrl)) + return -EINVAL; + + for (idx = ctrl->first; idx <= ctrl->last; idx++) { + desc = msi_alloc_desc(dev, 1, NULL); + if (!desc) + goto fail_mem; + ret = msi_insert_desc(dev, desc, ctrl->domid, idx); + if (ret) + goto fail; } + return 0; + +fail_mem: + ret = -ENOMEM; +fail: + msi_domain_free_descs(dev, ctrl); + return ret; } void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) @@ -178,9 +269,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg); static void msi_device_data_release(struct device *dev, void *res) { struct msi_device_data *md = res; + int i; - WARN_ON_ONCE(!xa_empty(&md->__store)); - xa_destroy(&md->__store); + for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) { + msi_remove_device_irq_domain(dev, i); + WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)); + xa_destroy(&md->__domains[i].store); + } dev->msi.data = NULL; } @@ -197,7 +292,7 @@ static void msi_device_data_release(struct device *dev, void *res) int msi_setup_device_data(struct device *dev) { struct msi_device_data *md; - int ret; + int ret, i; if (dev->msi.data) return 0; @@ -212,7 +307,18 @@ int msi_setup_device_data(struct device *dev) return ret; } - xa_init(&md->__store); + for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) + xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC); + + /* + * If @dev::msi::domain is set and is a global MSI domain, copy the + * pointer into the domain array so all code can operate on domain + * ids. The NULL pointer check is required to keep the legacy + * architecture specific PCI/MSI support working. 
+ */ + if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain)) + md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain; + mutex_init(&md->mutex); dev->msi.data = md; devres_add(dev, md); @@ -235,27 +341,30 @@ EXPORT_SYMBOL_GPL(msi_lock_descs); */ void msi_unlock_descs(struct device *dev) { - /* Invalidate the index wich was cached by the iterator */ - dev->msi.data->__iter_idx = MSI_MAX_INDEX; + /* Invalidate the index which was cached by the iterator */ + dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } EXPORT_SYMBOL_GPL(msi_unlock_descs); -static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter) +static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, + enum msi_desc_filter filter) { + struct xarray *xa = &md->__domains[domid].store; struct msi_desc *desc; - xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) { + xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) { if (msi_desc_match(desc, filter)) return desc; } - md->__iter_idx = MSI_MAX_INDEX; + md->__iter_idx = MSI_XA_MAX_INDEX; return NULL; } /** - * msi_first_desc - Get the first MSI descriptor of a device + * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device * @dev: Device to operate on + * @domid: The id of the interrupt domain which should be walked. * @filter: Descriptor state filter * * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() @@ -264,23 +373,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_ * Return: Pointer to the first MSI descriptor matching the search * criteria, NULL if none found. */ -struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter) +struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter) { struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!md)) + if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) return NULL; lockdep_assert_held(&md->mutex); md->__iter_idx = 0; - return msi_find_desc(md, filter); + return msi_find_desc(md, domid, filter); } -EXPORT_SYMBOL_GPL(msi_first_desc); +EXPORT_SYMBOL_GPL(msi_domain_first_desc); /** * msi_next_desc - Get the next MSI descriptor of a device * @dev: Device to operate on + * @domid: The id of the interrupt domain which should be walked. + * @filter: Descriptor state filter * * The first invocation of msi_next_desc() has to be preceeded by a * successful invocation of __msi_first_desc(). Consecutive invocations are @@ -290,11 +402,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc); * Return: Pointer to the next MSI descriptor matching the search * criteria, NULL if none found. 
*/ -struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) +struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter) { struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!md)) + if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) return NULL; lockdep_assert_held(&md->mutex); @@ -303,30 +416,38 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) return NULL; md->__iter_idx++; - return msi_find_desc(md, filter); + return msi_find_desc(md, domid, filter); } EXPORT_SYMBOL_GPL(msi_next_desc); /** - * msi_get_virq - Return Linux interrupt number of a MSI interrupt + * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain * @dev: Device to operate on + * @domid: Domain ID of the interrupt domain associated to the device * @index: MSI interrupt index to look for (0-based) * * Return: The Linux interrupt number on success (> 0), 0 if not found */ -unsigned int msi_get_virq(struct device *dev, unsigned int index) +unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; unsigned int ret = 0; - bool pcimsi; + bool pcimsi = false; + struct xarray *xa; if (!dev->msi.data) return 0; - pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; + if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) + return 0; + + /* This check is only valid for the PCI default MSI domain */ + if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) + pcimsi = to_pci_dev(dev)->msi_enabled; msi_lock_descs(dev); - desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index); + xa = &dev->msi.data->__domains[domid].store; + desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { /* * PCI-MSI has only one descriptor for multiple interrupts. 
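With descriptors now stored per interrupt domain, lookups such as msi_domain_get_virq() above take a domain id. A minimal sketch of a driver resolving the Linux interrupt for MSI index 0 in the default domain and requesting it; the handler and the interrupt name are placeholders.

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/msi.h>

static irqreturn_t demo_msi_handler(int irq, void *data)
{
	return IRQ_HANDLED;
}

/* Hypothetical setup step after the MSI descriptors have been allocated. */
static int demo_request_first_msi(struct device *dev)
{
	unsigned int virq;

	/* @index is the per-device MSI index; 0 is returned when nothing is mapped. */
	virq = msi_domain_get_virq(dev, MSI_DEFAULT_DOMAIN, 0);
	if (!virq)
		return -ENXIO;

	return request_irq(virq, demo_msi_handler, 0, "demo-msi0", dev);
}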
@@ -340,10 +461,11 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) ret = desc->irq; } } + msi_unlock_descs(dev); return ret; } -EXPORT_SYMBOL_GPL(msi_get_virq); +EXPORT_SYMBOL_GPL(msi_domain_get_virq); #ifdef CONFIG_SYSFS static struct attribute *msi_dev_attrs[] = { @@ -459,7 +581,39 @@ static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *d static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { } #endif /* !CONFIG_SYSFS */ -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid) +{ + struct irq_domain *domain; + + lockdep_assert_held(&dev->msi.data->mutex); + + if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS)) + return NULL; + + domain = dev->msi.data->__domains[domid].domain; + if (!domain) + return NULL; + + if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain))) + return NULL; + + return domain; +} + +static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid) +{ + struct msi_domain_info *info; + struct irq_domain *domain; + + domain = msi_get_device_domain(dev, domid); + if (domain) { + info = domain->host_data; + return info->hwsize; + } + /* No domain, default to MSI_XA_DOMAIN_SIZE */ + return MSI_XA_DOMAIN_SIZE; +} + static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) { @@ -613,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain, return 0; } -static int msi_domain_ops_check(struct irq_domain *domain, - struct msi_domain_info *info, - struct device *dev) -{ - return 0; -} - static struct msi_domain_ops msi_domain_ops_default = { .get_hwirq = msi_domain_ops_get_hwirq, .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, .msi_prepare = msi_domain_ops_prepare, .set_desc = msi_domain_ops_set_desc, - .domain_alloc_irqs = __msi_domain_alloc_irqs, - .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -639,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } - if (ops->domain_alloc_irqs == NULL) - ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; - if (ops->domain_free_irqs == NULL) - ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; - if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) return; @@ -651,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) ops->msi_init = msi_domain_ops_default.msi_init; - if (ops->msi_check == NULL) - ops->msi_check = msi_domain_ops_default.msi_check; if (ops->msi_prepare == NULL) ops->msi_prepare = msi_domain_ops_default.msi_prepare; if (ops->set_desc == NULL) @@ -668,6 +805,40 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) chip->irq_set_affinity = msi_domain_set_affinity; } +static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, + struct msi_domain_info *info, + unsigned int flags, + struct irq_domain *parent) +{ + struct irq_domain *domain; + + if (info->hwsize > MSI_XA_DOMAIN_SIZE) + return NULL; + + /* + * Hardware size 0 is valid for backwards compatibility and for + * domains which are not backed by a hardware table. Grant the + * maximum index space. 
+ */ + if (!info->hwsize) + info->hwsize = MSI_XA_DOMAIN_SIZE; + + msi_domain_update_dom_ops(info); + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + msi_domain_update_chip_ops(info); + + domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + + if (domain) { + if (!domain->name && info->chip) + domain->name = info->chip->name; + irq_domain_update_bus_token(domain, info->bus_token); + } + + return domain; +} + /** * msi_create_irq_domain - Create an MSI interrupt domain * @fwnode: Optional fwnode of the interrupt controller @@ -680,19 +851,210 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + return __msi_create_irq_domain(fwnode, info, 0, parent); +} + +/** + * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down + * in the domain hierarchy + * @dev: The device for which the domain should be created + * @domain: The domain in the hierarchy this op is being called on + * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to + * be created + * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE + * domain to be created + * + * Return: true on success, false otherwise + * + * This is the most complex problem of per device MSI domains and the + * underlying interrupt domain hierarchy: + * + * The device domain to be initialized requests the broadest feature set + * possible and the underlying domain hierarchy puts restrictions on it. + * + * That's trivial for a simple parent->child relationship, but it gets + * interesting with an intermediate domain: root->parent->child. The + * intermediate 'parent' can expand the capabilities which the 'root' + * domain is providing. So that creates a classic hen and egg problem: + * Which entity is doing the restrictions/expansions? + * + * One solution is to let the root domain handle the initialization that's + * why there is the @domain and the @msi_parent_domain pointer. + */ +bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *msi_parent_domain, + struct msi_domain_info *msi_child_info) +{ + struct irq_domain *parent = domain->parent; + + if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops || + !parent->msi_parent_ops->init_dev_msi_info)) + return false; + + return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain, + msi_child_info); +} + +/** + * msi_create_device_irq_domain - Create a device MSI interrupt domain + * @dev: Pointer to the device + * @domid: Domain id + * @template: MSI domain info bundle used as template + * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited) + * @domain_data: Optional pointer to domain specific data which is set in + * msi_domain_info::data + * @chip_data: Optional pointer to chip specific data which is set in + * msi_domain_info::chip_data + * + * Return: True on success, false otherwise + * + * There is no firmware node required for this interface because the per + * device domains are software constructs which are actually closer to the + * hardware reality than any firmware can describe them. + * + * The domain name and the irq chip name for a MSI device domain are + * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)" + * + * $PREFIX: Optional prefix provided by the underlying MSI parent domain + * via msi_parent_ops::prefix. If that pointer is NULL the prefix + * is empty. 
+ * $CHIPNAME: The name of the irq_chip in @template + * $DEVNAME: The name of the device + * + * This results in understandable chip names and hardware interrupt numbers + * in e.g. /proc/interrupts + * + * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix + * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-' + * + * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect + * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device + * IR-PCI-MSIX-0000:3d:00.0 2-edge + * + * On IMS domains the hardware interrupt number is either a table entry + * index or a purely software managed index but it is guaranteed to be + * unique. + * + * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All + * subsequent operations on the domain depend on the domain id. + * + * The domain is automatically freed when the device is removed via devres + * in the context of @dev::msi::data freeing, but it can also be + * independently removed via @msi_remove_device_irq_domain(). + */ +bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, + const struct msi_domain_template *template, + unsigned int hwsize, void *domain_data, + void *chip_data) +{ + struct irq_domain *domain, *parent = dev->msi.domain; + const struct msi_parent_ops *pops; + struct msi_domain_template *bundle; + struct fwnode_handle *fwnode; + + if (!irq_domain_is_msi_parent(parent)) + return false; + + if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) + return false; + + bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + if (!bundle) + return false; + + bundle->info.hwsize = hwsize; + bundle->info.chip = &bundle->chip; + bundle->info.ops = &bundle->ops; + bundle->info.data = domain_data; + bundle->info.chip_data = chip_data; + + pops = parent->msi_parent_ops; + snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", + pops->prefix ? 
: "", bundle->chip.name, dev_name(dev)); + bundle->chip.name = bundle->name; + + fwnode = irq_domain_alloc_named_fwnode(bundle->name); + if (!fwnode) + goto free_bundle; + + if (msi_setup_device_data(dev)) + goto free_fwnode; + + msi_lock_descs(dev); + + if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) + goto fail; + + if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) + goto fail; + + domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); + if (!domain) + goto fail; + + domain->dev = dev; + dev->msi.data->__domains[domid].domain = domain; + msi_unlock_descs(dev); + return true; + +fail: + msi_unlock_descs(dev); +free_fwnode: + kfree(fwnode); +free_bundle: + kfree(bundle); + return false; +} + +/** + * msi_remove_device_irq_domain - Free a device MSI interrupt domain + * @dev: Pointer to the device + * @domid: Domain id + */ +void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) +{ + struct msi_domain_info *info; struct irq_domain *domain; - msi_domain_update_dom_ops(info); - if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) - msi_domain_update_chip_ops(info); + msi_lock_descs(dev); - domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = msi_get_device_domain(dev, domid); - if (domain && !domain->name && info->chip) - domain->name = info->chip->name; + if (!domain || !irq_domain_is_msi_device(domain)) + goto unlock; - return domain; + dev->msi.data->__domains[domid].domain = NULL; + info = domain->host_data; + irq_domain_remove(domain); + kfree(container_of(info, struct msi_domain_template, info)); + +unlock: + msi_unlock_descs(dev); +} + +/** + * msi_match_device_irq_domain - Match a device irq domain against a bus token + * @dev: Pointer to the device + * @domid: Domain id + * @bus_token: Bus token to match against the domain bus token + * + * Return: True if device domain exists and bus tokens match. 
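msi_create_device_irq_domain() above builds a per-device MSI domain from a msi_domain_template that carries the irq_chip and msi_domain_info to be copied. A sketch of a driver-side template and creation call, assuming the device sits below an MSI parent domain and provides IMS-style slots; the DEMO names, the stubbed chip callbacks, the flag selection and the choice of domain id slot are assumptions, not part of the patch.

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/msi.h>

/* Hypothetical per-device (IMS-style) chip callbacks; the hardware side is assumed. */
static void demo_ims_write_msg(struct irq_data *data, struct msi_msg *msg) { }
static void demo_ims_mask(struct irq_data *data) { }
static void demo_ims_unmask(struct irq_data *data) { }

static const struct msi_domain_template demo_ims_template = {
	.chip = {
		.name			= "DEMO-IMS",
		.irq_mask		= demo_ims_mask,
		.irq_unmask		= demo_ims_unmask,
		.irq_write_msi_msg	= demo_ims_write_msg,
	},
	.info = {
		.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
			  MSI_FLAG_FREE_MSI_DESCS,
	},
};

/*
 * Create a per-device MSI domain in slot @domid (any free id below
 * MSI_MAX_DEVICE_IRQDOMAINS) with room for @nr_slots hardware entries.
 */
static int demo_create_ims_domain(struct device *dev, unsigned int domid,
				  unsigned int nr_slots)
{
	if (!msi_create_device_irq_domain(dev, domid, &demo_ims_template,
					  nr_slots, NULL, NULL))
		return -ENODEV;
	return 0;
}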
+ */ +bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, + enum irq_domain_bus_token bus_token) +{ + struct msi_domain_info *info; + struct irq_domain *domain; + bool ret = false; + + msi_lock_descs(dev); + domain = msi_get_device_domain(dev, domid); + if (domain && irq_domain_is_msi_device(domain)) { + info = domain->host_data; + ret = info->bus_token == bus_token; + } + msi_unlock_descs(dev); + return ret; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -700,13 +1062,8 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - int ret; - - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, arg); - return ret; + return ops->msi_prepare(domain, dev, nvec, arg); } int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, @@ -714,16 +1071,27 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + struct msi_ctrl ctrl = { + .domid = MSI_DEFAULT_DOMAIN, + .first = virq_base, + .last = virq_base + nvec - 1, + }; struct msi_desc *desc; + struct xarray *xa; int ret, virq; + if (!msi_ctrl_valid(dev, &ctrl)) + return -EINVAL; + msi_lock_descs(dev); - ret = msi_add_simple_msi_descs(dev, virq_base, nvec); + ret = msi_domain_add_simple_msi_descs(dev, &ctrl); if (ret) goto unlock; + xa = &dev->msi.data->__domains[ctrl.domid].store; + for (virq = virq_base; virq < virq_base + nvec; virq++) { - desc = xa_load(&dev->msi.data->__store, virq); + desc = xa_load(xa, virq); desc->irq = virq; ops->set_desc(arg, desc); @@ -739,7 +1107,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, fail: for (--virq; virq >= virq_base; virq--) irq_domain_free_irqs_common(domain, virq, 1); - msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1); + msi_domain_free_descs(dev, &ctrl); unlock: msi_unlock_descs(dev); return ret; @@ -764,6 +1132,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, switch(domain->bus_token) { case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: case DOMAIN_BUS_VMD_MSI: break; default: @@ -789,6 +1159,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, { switch(domain->bus_token) { case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: case DOMAIN_BUS_VMD_MSI: if (IS_ENABLED(CONFIG_PCI_MSI)) break; @@ -850,18 +1222,19 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag return 0; } -int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain, + struct msi_ctrl *ctrl) { + struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store; struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + unsigned int vflags = 0, allocated = 0; msi_alloc_info_t arg = { }; - unsigned int vflags = 0; struct msi_desc *desc; - int allocated = 0; + unsigned long idx; int i, ret, virq; - ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg); if (ret) return ret; @@ -883,11 +1256,21 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, 
* MSI affinity setting requires a special quirk (X86) when * reservation mode is active. */ - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + if (info->flags & MSI_FLAG_NOMASK_QUIRK) vflags |= VIRQ_NOMASK_QUIRK; } - msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) { + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED)) + continue; + + /* This should return -ECONFUSED... */ + if (WARN_ON_ONCE(allocated >= ctrl->nirqs)) + return -EINVAL; + + if (ops->prepare_desc) + ops->prepare_desc(domain, &arg, desc); + ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -913,76 +1296,213 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0; } -static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info, - struct device *dev, - unsigned int num_descs) +static int msi_domain_alloc_simple_msi_descs(struct device *dev, + struct msi_domain_info *info, + struct msi_ctrl *ctrl) { if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS)) return 0; - return msi_add_simple_msi_descs(dev, 0, num_descs); + return msi_domain_add_simple_msi_descs(dev, ctrl); +} + +static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) +{ + struct msi_domain_info *info; + struct msi_domain_ops *ops; + struct irq_domain *domain; + int ret; + + if (!msi_ctrl_valid(dev, ctrl)) + return -EINVAL; + + domain = msi_get_device_domain(dev, ctrl->domid); + if (!domain) + return -ENODEV; + + info = domain->host_data; + + ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl); + if (ret) + return ret; + + ops = info->ops; + if (ops->domain_alloc_irqs) + return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs); + + return __msi_domain_alloc_irqs(dev, domain, ctrl); +} + +static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) +{ + int ret = __msi_domain_alloc_locked(dev, ctrl); + + if (ret) + msi_domain_free_locked(dev, ctrl); + return ret; } /** - * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from + * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated - * @nvec: The number of interrupts to allocate + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) * * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() - * pair. Use this for MSI irqdomains which implement their own vector + * pair. Use this for MSI irqdomains which implement their own descriptor * allocation/free. * * Return: %0 on success or an error code. 
*/ -int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, - int nvec) +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; - int ret; - - lockdep_assert_held(&dev->msi.data->mutex); + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + .nirqs = last + 1 - first, + }; + + return msi_domain_alloc_locked(dev, &ctrl); +} - ret = msi_domain_add_simple_msi_descs(info, dev, nvec); - if (ret) - return ret; +/** + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + int ret; - ret = ops->domain_alloc_irqs(domain, dev, nvec); - if (ret) - msi_domain_free_irqs_descs_locked(domain, dev); + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); return ret; } /** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from + * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain + * * @dev: Pointer to device struct of the device for which the interrupts * are allocated - * @nvec: The number of interrupts to allocate + * @domid: Id of the interrupt domain to operate on + * @nirqs: The number of interrupts to allocate + * + * This function scans all MSI descriptors of the MSI domain and allocates interrupts + * for all unassigned ones. That function is to be used for MSI domain usage where + * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X]. * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) +int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs) { + struct msi_ctrl ctrl = { + .domid = domid, + .first = 0, + .last = msi_domain_get_hwsize(dev, domid) - 1, + .nirqs = nirqs, + }; + + return msi_domain_alloc_locked(dev, &ctrl); +} + +/** + * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at + * a given index - or at the next free index + * + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation + * uses the next free index. + * @affdesc: Optional pointer to an interrupt affinity descriptor structure + * @icookie: Optional pointer to a domain specific per instance cookie. If + * non-NULL the content of the cookie is stored in msi_desc::data. + * Must be NULL for MSI-X allocations + * + * This requires a MSI interrupt domain which lets the core code manage the + * MSI descriptors. + * + * Return: struct msi_map + * + * On success msi_map::index contains the allocated index number and + * msi_map::virq the corresponding Linux interrupt number + * + * On failure msi_map::index contains the error code and msi_map::virq + * is %0. 
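Building on such a device domain, msi_domain_alloc_irq_at() hands back a struct msi_map pairing the chosen MSI index with the Linux interrupt number, with a negative errno in msi_map::index on failure as documented above. A sketch of allocating one vector at the next free slot and freeing it again via msi_domain_free_irqs_range(), which appears further down; the cookie layout (msi_instance_cookie::value) and the demo_* names are assumptions.

#include <linux/device.h>
#include <linux/msi.h>

/* Allocate one interrupt in the per-device domain created earlier. */
static int demo_alloc_ims_vector(struct device *dev, unsigned int domid,
				 u64 hw_cookie, unsigned int *index, int *virq)
{
	union msi_instance_cookie icookie = { .value = hw_cookie };
	struct msi_map map;

	/* MSI_ANY_INDEX lets the core pick the next free slot in the domain. */
	map = msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
	if (map.index < 0)
		return map.index;	/* negative errno, per the kernel-doc above */

	*index = map.index;
	*virq = map.virq;
	return 0;
}

static void demo_free_ims_vector(struct device *dev, unsigned int domid,
				 unsigned int index)
{
	msi_domain_free_irqs_range(dev, domid, index, index);
}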
+ */ +struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index, + const struct irq_affinity_desc *affdesc, + union msi_instance_cookie *icookie) +{ + struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, }; + struct irq_domain *domain; + struct msi_map map = { }; + struct msi_desc *desc; int ret; msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec); + domain = msi_get_device_domain(dev, domid); + if (!domain) { + map.index = -ENODEV; + goto unlock; + } + + desc = msi_alloc_desc(dev, 1, affdesc); + if (!desc) { + map.index = -ENOMEM; + goto unlock; + } + + if (icookie) + desc->data.icookie = *icookie; + + ret = msi_insert_desc(dev, desc, domid, index); + if (ret) { + map.index = ret; + goto unlock; + } + + ctrl.first = ctrl.last = desc->msi_index; + + ret = __msi_domain_alloc_irqs(dev, domain, &ctrl); + if (ret) { + map.index = ret; + msi_domain_free_locked(dev, &ctrl); + } else { + map.index = desc->msi_index; + map.virq = desc->irq; + } +unlock: msi_unlock_descs(dev); - return ret; + return map; } -void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain, + struct msi_ctrl *ctrl) { + struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store; struct msi_domain_info *info = domain->host_data; struct irq_data *irqd; struct msi_desc *desc; + unsigned long idx; int i; - /* Only handle MSI entries which have an interrupt associated */ - msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + /* Only handle MSI entries which have an interrupt associated */ + if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED)) + continue; + /* Make sure all interrupts are deactivated */ for (i = 0; i < desc->nvec_used; i++) { irqd = irq_domain_get_irq_data(domain, desc->irq + i); @@ -997,44 +1517,99 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } } -static void msi_domain_free_msi_descs(struct msi_domain_info *info, - struct device *dev) +static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) { + struct msi_domain_info *info; + struct msi_domain_ops *ops; + struct irq_domain *domain; + + if (!msi_ctrl_valid(dev, ctrl)) + return; + + domain = msi_get_device_domain(dev, ctrl->domid); + if (!domain) + return; + + info = domain->host_data; + ops = info->ops; + + if (ops->domain_free_irqs) + ops->domain_free_irqs(domain, dev); + else + __msi_domain_free_irqs(dev, domain, ctrl); + + if (ops->msi_post_free) + ops->msi_post_free(domain, dev); + if (info->flags & MSI_FLAG_FREE_MSI_DESCS) - msi_free_msi_descs(dev); + msi_domain_free_descs(dev, ctrl); } /** - * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev - * @domain: The domain to managing the interrupts + * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain + * associated to @dev with msi_lock held * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are freed + * @domid: Id of the interrupt domain to operate on + * @first: First index to free (inclusive) + * @last: Last index to free (inclusive) + */ +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + }; + msi_domain_free_locked(dev, &ctrl); +} + +/** + * 
msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain + * associated to @dev + * @dev: Pointer to device struct of the device for which the interrupts + * are freed + * @domid: Id of the interrupt domain to operate on + * @first: First index to free (inclusive) + * @last: Last index to free (inclusive) + */ +void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + msi_lock_descs(dev); + msi_domain_free_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); +} + +/** + * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain + * associated to a device + * @dev: Pointer to device struct of the device for which the interrupts + * are freed + * @domid: The id of the domain to operate on * * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() * pair. Use this for MSI irqdomains which implement their own vector * allocation. */ -void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; - - lockdep_assert_held(&dev->msi.data->mutex); - - ops->domain_free_irqs(domain, dev); - msi_domain_free_msi_descs(info, dev); + msi_domain_free_irqs_range_locked(dev, domid, 0, + msi_domain_get_hwsize(dev, domid) - 1); } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev - * @domain: The domain to managing the interrupts + * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain + * associated to a device * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are freed + * @domid: The id of the domain to operate on */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { msi_lock_descs(dev); - msi_domain_free_irqs_descs_locked(domain, dev); + msi_domain_free_irqs_all_locked(dev, domid); msi_unlock_descs(dev); } @@ -1048,5 +1623,3 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) { return (struct msi_domain_info *)domain->host_data; } - -#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 714ac4c3b556..d9c822bbffb8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -113,11 +113,40 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_slow_inc_cpuslocked(struct static_key *key) +/* + * static_key_fast_inc_not_disabled - adds a user for a static key + * @key: static key that must be already enabled + * + * The caller must make sure that the static key can't get disabled while + * in this function. It doesn't patch jump labels, only adds a user to + * an already enabled static key. + * + * Returns true if the increment was done. Unlike refcount_t the ref counter + * is not saturated, but will fail to increment on overflow. + */ +bool static_key_fast_inc_not_disabled(struct static_key *key) { - int v, v1; + int v; STATIC_KEY_CHECK_USE(key); + /* + * Negative key->enabled has a special meaning: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. 
+ */ + v = atomic_read(&key->enabled); + do { + if (v <= 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + + return true; +} +EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); + +bool static_key_slow_inc_cpuslocked(struct static_key *key) +{ lockdep_assert_cpus_held(); /* @@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. - * - * So give a special meaning to negative key->enabled: it sends - * static_key_slow_inc() down the slow path, and it is non-zero - * so it counts as "enabled" in jump_label_update(). Note that - * atomic_inc_unless_negative() checks >= 0, so roll our own. */ - for (v = atomic_read(&key->enabled); v > 0; v = v1) { - v1 = atomic_cmpxchg(&key->enabled, v, v + 1); - if (likely(v1 == v)) - return; - } + if (static_key_fast_inc_not_disabled(key)) + return true; jump_label_lock(); if (atomic_read(&key->enabled) == 0) { @@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) */ atomic_set_release(&key->enabled, 1); } else { - atomic_inc(&key->enabled); + if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) { + jump_label_unlock(); + return false; + } } jump_label_unlock(); + return true; } -void static_key_slow_inc(struct static_key *key) +bool static_key_slow_inc(struct static_key *key) { + bool ret; + cpus_read_lock(); - static_key_slow_inc_cpuslocked(key); + ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3e7e2c2ad2f7..83f499182c9a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -50,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, data = &kallsyms_names[off]; len = *data; data++; + off++; + + /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ + if ((len & 0x80) != 0) { + len = (len & 0x7F) | (*data << 7); + data++; + off++; + } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ - off += len + 1; + off += len; /* * For every byte on the compressed symbol data, copy the table @@ -108,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off) static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; - int i; + int i, len; /* * Use the closest marker we have. We have markers every 256 positions, @@ -122,13 +130,23 @@ static unsigned int get_symbol_offset(unsigned long pos) * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ - for (i = 0; i < (pos & 0xFF); i++) - name = name + (*name) + 1; + for (i = 0; i < (pos & 0xFF); i++) { + len = *name; + + /* + * If MSB is 1, it is a "big" symbol, so we need to look into + * the next byte (and skip it, too). + */ + if ((len & 0x80) != 0) + len = ((len & 0x7F) | (name[1] << 7)) + 1; + + name = name + len + 1; + } return name - kallsyms_names; } -static unsigned long kallsyms_sym_address(int idx) +unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; @@ -159,7 +177,6 @@ static bool cleanup_symbol_name(char *s) * character in an identifier in C. 
Suffixes observed: * - foo.llvm.[0-9a-f]+ * - foo.[0-9a-f]+ - * - foo.[0-9a-f]+.cfi_jt */ res = strchr(s, '.'); if (res) { @@ -167,45 +184,103 @@ static bool cleanup_symbol_name(char *s) return true; } - if (!IS_ENABLED(CONFIG_CFI_CLANG) || - !IS_ENABLED(CONFIG_LTO_CLANG_THIN) || - CONFIG_CLANG_VERSION >= 130000) - return false; + return false; +} - /* - * Prior to LLVM 13, the following suffixes were observed when thinLTO - * and CFI are both enabled: - * - foo$[0-9]+ - */ - res = strrchr(s, '$'); - if (res) { - *res = '\0'; - return true; +static int compare_symbol_name(const char *name, char *namebuf) +{ + int ret; + + ret = strcmp(name, namebuf); + if (!ret) + return ret; + + if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf)) + return 0; + + return ret; +} + +static unsigned int get_symbol_seq(int index) +{ + unsigned int i, seq = 0; + + for (i = 0; i < 3; i++) + seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; + + return seq; +} + +static int kallsyms_lookup_names(const char *name, + unsigned int *start, + unsigned int *end) +{ + int ret; + int low, mid, high; + unsigned int seq, off; + char namebuf[KSYM_NAME_LEN]; + + low = 0; + high = kallsyms_num_syms - 1; + + while (low <= high) { + mid = low + (high - low) / 2; + seq = get_symbol_seq(mid); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + ret = compare_symbol_name(name, namebuf); + if (ret > 0) + low = mid + 1; + else if (ret < 0) + high = mid - 1; + else + break; } - return false; + if (low > high) + return -ESRCH; + + low = mid; + while (low) { + seq = get_symbol_seq(low - 1); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + if (compare_symbol_name(name, namebuf)) + break; + low--; + } + *start = low; + + if (end) { + high = mid; + while (high < kallsyms_num_syms - 1) { + seq = get_symbol_seq(high + 1); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + if (compare_symbol_name(name, namebuf)) + break; + high++; + } + *end = high; + } + + return 0; } /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { - char namebuf[KSYM_NAME_LEN]; - unsigned long i; - unsigned int off; + int ret; + unsigned int i; /* Skip the search for empty string. 
*/ if (!*name) return 0; - for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - - if (strcmp(namebuf, name) == 0) - return kallsyms_sym_address(i); + ret = kallsyms_lookup_names(name, &i, NULL); + if (!ret) + return kallsyms_sym_address(get_symbol_seq(i)); - if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0) - return kallsyms_sym_address(i); - } return module_kallsyms_lookup_name(name); } @@ -232,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, return 0; } +int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data) +{ + int ret; + unsigned int i, start, end; + + ret = kallsyms_lookup_names(name, &start, &end); + if (ret) + return 0; + + for (i = start; !ret && i <= end; i++) { + ret = fn(data, kallsyms_sym_address(get_symbol_seq(i))); + cond_resched(); + } + + return ret; +} + static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 2d0c6f2f0243..27fabdcc40f5 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; extern const unsigned int kallsyms_markers[] __weak; +extern const u8 kallsyms_seqs_of_names[] __weak; #endif // LINUX_KALLSYMS_INTERNAL_H_ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c new file mode 100644 index 000000000000..f35d9cc1aab1 --- /dev/null +++ b/kernel/kallsyms_selftest.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test the function and performance of kallsyms + * + * Copyright (C) Huawei Technologies Co., Ltd., 2022 + * + * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei + */ + +#define pr_fmt(fmt) "kallsyms_selftest: " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/random.h> +#include <linux/sched/clock.h> +#include <linux/kthread.h> +#include <linux/vmalloc.h> + +#include "kallsyms_internal.h" +#include "kallsyms_selftest.h" + + +#define MAX_NUM_OF_RECORDS 64 + +struct test_stat { + int min; + int max; + int save_cnt; + int real_cnt; + int perf; + u64 sum; + char *name; + unsigned long addr; + unsigned long addrs[MAX_NUM_OF_RECORDS]; +}; + +struct test_item { + char *name; + unsigned long addr; +}; + +#define ITEM_FUNC(s) \ + { \ + .name = #s, \ + .addr = (unsigned long)s, \ + } + +#define ITEM_DATA(s) \ + { \ + .name = #s, \ + .addr = (unsigned long)&s, \ + } + + +static int kallsyms_test_var_bss_static; +static int kallsyms_test_var_data_static = 1; +int kallsyms_test_var_bss; +int kallsyms_test_var_data = 1; + +static int kallsyms_test_func_static(void) +{ + kallsyms_test_var_bss_static++; + kallsyms_test_var_data_static++; + + return 0; +} + +int kallsyms_test_func(void) +{ + return kallsyms_test_func_static(); +} + +__weak int kallsyms_test_func_weak(void) +{ + kallsyms_test_var_bss++; + kallsyms_test_var_data++; + return 0; +} + +static struct test_item test_items[] = { + ITEM_FUNC(kallsyms_test_func_static), + ITEM_FUNC(kallsyms_test_func), + ITEM_FUNC(kallsyms_test_func_weak), + ITEM_FUNC(vmalloc), + ITEM_FUNC(vfree), +#ifdef CONFIG_KALLSYMS_ALL + ITEM_DATA(kallsyms_test_var_bss_static), + ITEM_DATA(kallsyms_test_var_data_static), + ITEM_DATA(kallsyms_test_var_bss), + ITEM_DATA(kallsyms_test_var_data), + ITEM_DATA(vmap_area_list), 
+#endif +}; + +static char stub_name[KSYM_NAME_LEN]; + +static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr) +{ + *(u32 *)data += strlen(name); + + return 0; +} + +static void test_kallsyms_compression_ratio(void) +{ + u32 pos, off, len, num; + u32 ratio, total_size, total_len = 0; + + kallsyms_on_each_symbol(stat_symbol_len, &total_len); + + /* + * A symbol name cannot start with a number. This stub name helps us + * traverse the entire symbol table without finding a match. It's used + * for subsequent performance tests, and its length is the average + * length of all symbol names. + */ + memset(stub_name, '4', sizeof(stub_name)); + pos = total_len / kallsyms_num_syms; + stub_name[pos] = 0; + + pos = 0; + num = 0; + off = 0; + while (pos < kallsyms_num_syms) { + len = kallsyms_names[off]; + num++; + off++; + pos++; + if ((len & 0x80) != 0) { + len = (len & 0x7f) | (kallsyms_names[off] << 7); + num++; + off++; + } + off += len; + } + + /* + * 1. The length fields is not counted + * 2. The memory occupied by array kallsyms_token_table[] and + * kallsyms_token_index[] needs to be counted. + */ + total_size = off - num; + pos = kallsyms_token_index[0xff]; + total_size += pos + strlen(&kallsyms_token_table[pos]) + 1; + total_size += 0x100 * sizeof(u16); + + pr_info(" ---------------------------------------------------------\n"); + pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n"); + pr_info("|---------------------------------------------------------|\n"); + ratio = (u32)div_u64(10000ULL * total_size, total_len); + pr_info("| %10d | %10d | %10d | %2d.%-2d |\n", + kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100); + pr_info(" ---------------------------------------------------------\n"); +} + +static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) +{ + u64 t0, t1, t; + unsigned long flags; + struct test_stat *stat = (struct test_stat *)data; + + local_irq_save(flags); + t0 = sched_clock(); + (void)kallsyms_lookup_name(name); + t1 = sched_clock(); + local_irq_restore(flags); + + t = t1 - t0; + if (t < stat->min) + stat->min = t; + + if (t > stat->max) + stat->max = t; + + stat->real_cnt++; + stat->sum += t; + + return 0; +} + +static void test_perf_kallsyms_lookup_name(void) +{ + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.min = INT_MAX; + kallsyms_on_each_symbol(lookup_name, &stat); + pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt); + pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n", + stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); +} + +static bool match_cleanup_name(const char *s, const char *name) +{ + char *p; + int len; + + if (!IS_ENABLED(CONFIG_LTO_CLANG)) + return false; + + p = strchr(s, '.'); + if (!p) + return false; + + len = strlen(name); + if (p - s != len) + return false; + + return !strncmp(s, name, len); +} + +static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + if (strcmp(name, stat->name) == 0 || + (!stat->perf && match_cleanup_name(name, stat->name))) { + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + } + + return 0; +} + +static void test_perf_kallsyms_on_each_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct 
test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + stat.perf = 1; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_symbol(find_symbol, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int match_symbol(void *data, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + + return 0; +} + +static void test_perf_kallsyms_on_each_match_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int test_kallsyms_basic_function(void) +{ + int i, j, ret; + int next = 0, nr_failed = 0; + char *prefix; + unsigned short rand; + unsigned long addr, lookup_addr; + char namebuf[KSYM_NAME_LEN]; + struct test_stat *stat, *stat2; + + stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + if (!stat) + return -ENOMEM; + stat2 = stat + 1; + + prefix = "kallsyms_lookup_name() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + addr = kallsyms_lookup_name(test_items[i].name); + if (addr != test_items[i].addr) { + nr_failed++; + pr_info("%s %s failed: addr=%lx, expect %lx\n", + prefix, test_items[i].name, addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_symbol(find_symbol, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_match_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + if (nr_failed) { + kfree(stat); + return -ESRCH; + } + + for (i = 0; i < kallsyms_num_syms; i++) { + addr = kallsyms_sym_address(i); + if (!is_ksym_addr(addr)) + continue; + + ret = lookup_symbol_name(addr, namebuf); + if (unlikely(ret)) { + namebuf[0] = 0; + goto failed; + } + + /* + * The first '.' may be the initial letter, in which case the + * entire symbol name will be truncated to an empty string in + * cleanup_symbol_name(). Do not test these symbols. + * + * For example: + * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." 
| head + * .E_read_words + * .E_leading_bytes + * .E_trailing_bytes + * .E_write_words + * .E_copy + * .str.292.llvm.12122243386960820698 + * .str.24.llvm.12122243386960820698 + * .str.29.llvm.12122243386960820698 + * .str.75.llvm.12122243386960820698 + * .str.99.llvm.12122243386960820698 + */ + if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0]) + continue; + + lookup_addr = kallsyms_lookup_name(namebuf); + + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + kallsyms_on_each_match_symbol(match_symbol, namebuf, stat); + + /* + * kallsyms_on_each_symbol() is too slow, randomly select some + * symbols for test. + */ + if (i >= next) { + memset(stat2, 0, sizeof(*stat2)); + stat2->max = INT_MAX; + stat2->name = namebuf; + kallsyms_on_each_symbol(find_symbol, stat2); + + /* + * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol() + * need to get the same traversal result. + */ + if (stat->addr != stat2->addr || + stat->real_cnt != stat2->real_cnt || + memcmp(stat->addrs, stat2->addrs, + stat->save_cnt * sizeof(stat->addrs[0]))) + goto failed; + + /* + * The average of random increments is 128, that is, one of + * them is tested every 128 symbols. + */ + get_random_bytes(&rand, sizeof(rand)); + next = i + (rand & 0xff) + 1; + } + + /* Need to be found at least once */ + if (!stat->real_cnt) + goto failed; + + /* + * kallsyms_lookup_name() returns the address of the first + * symbol found and cannot be NULL. + */ + if (!lookup_addr || lookup_addr != stat->addrs[0]) + goto failed; + + /* + * If the addresses of all matching symbols are recorded, the + * target address needs to be exist. + */ + if (stat->real_cnt <= MAX_NUM_OF_RECORDS) { + for (j = 0; j < stat->save_cnt; j++) { + if (stat->addrs[j] == addr) + break; + } + + if (j == stat->save_cnt) + goto failed; + } + } + + kfree(stat); + + return 0; + +failed: + pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr); + kfree(stat); + return -ESRCH; +} + +static int test_entry(void *p) +{ + int ret; + + do { + schedule_timeout(5 * HZ); + } while (system_state != SYSTEM_RUNNING); + + pr_info("start\n"); + ret = test_kallsyms_basic_function(); + if (ret) { + pr_info("abort\n"); + return 0; + } + + test_kallsyms_compression_ratio(); + test_perf_kallsyms_lookup_name(); + test_perf_kallsyms_on_each_symbol(); + test_perf_kallsyms_on_each_match_symbol(); + pr_info("finish\n"); + + return 0; +} + +static int __init kallsyms_test_init(void) +{ + struct task_struct *t; + + t = kthread_create(test_entry, NULL, "kallsyms_test"); + if (IS_ERR(t)) { + pr_info("Create kallsyms selftest task failed\n"); + return PTR_ERR(t); + } + kthread_bind(t, 0); + wake_up_process(t); + + return 0; +} +late_initcall(kallsyms_test_init); diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h new file mode 100644 index 000000000000..c0ca548e2a22 --- /dev/null +++ b/kernel/kallsyms_selftest.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef LINUX_KALLSYMS_SELFTEST_H_ +#define LINUX_KALLSYMS_SELFTEST_H_ + +#include <linux/types.h> + +extern int kallsyms_test_var_bss; +extern int kallsyms_test_var_data; + +extern int kallsyms_test_func(void); +extern int kallsyms_test_func_weak(void); + +#endif // LINUX_KALLSYMS_SELFTEST_H_ diff --git a/kernel/kcov.c b/kernel/kcov.c index e19c84b02452..e5cd09fd8a05 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/hashtable.h> #include <linux/init.h> +#include <linux/kmsan-checks.h> #include <linux/mm.h> #include 
<linux/preempt.h> #include <linux/printk.h> @@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); + /* + * KMSAN doesn't instrument this file, so it may not know area->list + * is initialized. Unpoison it explicitly to avoid reports in + * kcov_remote_area_get(). + */ + kmsan_unpoison_memory(&area->list, sizeof(area->list)); } static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 4f35d1bced6a..8cf70f068d92 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index fe12dfe254ec..54d077e1a2dc 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -14,10 +14,12 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/sched.h> +#include <linux/string.h> #include <linux/uaccess.h> #include "encoding.h" @@ -1308,3 +1310,51 @@ noinline void __tsan_atomic_signal_fence(int memorder) } } EXPORT_SYMBOL(__tsan_atomic_signal_fence); + +#ifdef __HAVE_ARCH_MEMSET +void *__tsan_memset(void *s, int c, size_t count); +noinline void *__tsan_memset(void *s, int c, size_t count) +{ + /* + * Instead of not setting up watchpoints where accessed size is greater + * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE. 
+ */ + size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE); + + check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + return memset(s, c, count); +} +#else +void *__tsan_memset(void *s, int c, size_t count) __alias(memset); +#endif +EXPORT_SYMBOL(__tsan_memset); + +#ifdef __HAVE_ARCH_MEMMOVE +void *__tsan_memmove(void *dst, const void *src, size_t len); +noinline void *__tsan_memmove(void *dst, const void *src, size_t len) +{ + size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE); + + check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + check_access(src, check_len, 0, _RET_IP_); + return memmove(dst, src, len); +} +#else +void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove); +#endif +EXPORT_SYMBOL(__tsan_memmove); + +#ifdef __HAVE_ARCH_MEMCPY +void *__tsan_memcpy(void *dst, const void *src, size_t len); +noinline void *__tsan_memcpy(void *dst, const void *src, size_t len) +{ + size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE); + + check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + check_access(src, check_len, 0, _RET_IP_); + return memcpy(dst, src, len); +} +#else +void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy); +#endif +EXPORT_SYMBOL(__tsan_memcpy); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 67794404042a..e95ce7d7a76e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change, dump_stack_print_info(KERN_DEFAULT); pr_err("==================================================================\n"); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KCSAN"); } static void release_report(unsigned long *flags, struct other_info *other_info) diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 75712959c84e..8679322450f2 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -22,13 +22,6 @@ #define ITERS_PER_TEST 2000 -/* Test requirements. */ -static bool __init test_requires(void) -{ - /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; -} - /* * Test watchpoint encode and decode: check that encoding some access's info, * and then subsequent decode preserves the access's info. 
@@ -38,15 +31,15 @@ static bool __init test_encode_decode(void) int i; for (i = 0; i < ITERS_PER_TEST; ++i) { - size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; - bool is_write = !!prandom_u32_max(2); + size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE); + bool is_write = !!get_random_u32_below(2); unsigned long verif_masked_addr; long encoded_watchpoint; bool verif_is_write; unsigned long addr; size_t verif_size; - prandom_bytes(&addr, sizeof(addr)); + get_random_bytes(&addr, sizeof(addr)); if (addr < PAGE_SIZE) addr = PAGE_SIZE; @@ -259,7 +252,6 @@ static int __init kcsan_selftest(void) pr_err("selftest: " #do_test " failed"); \ } while (0) - RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); RUN_TEST(test_barrier); diff --git a/kernel/kexec.c b/kernel/kexec.c index b5e40f069768..cb8e6e6f983c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, /* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (flags & KEXEC_ON_CRASH) { @@ -165,7 +162,7 @@ out: kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index acd029b307e4..969e8f52f7da 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -46,7 +46,7 @@ #include <crypto/hash.h> #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. 
*/ note_buf_t __percpu *crash_notes; @@ -561,23 +561,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) static int kimage_set_destination(struct kimage *image, unsigned long destination) { - int result; - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - return result; + return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { - int result; - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - return result; + return kimage_add_entry(image, page | IND_SOURCE); } @@ -809,7 +803,7 @@ static int kimage_load_normal_segment(struct kimage *image, if (result < 0) goto out; - ptr = kmap(page); + ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; @@ -822,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); - kunmap(page); + kunmap_local(ptr); if (result) { result = -EFAULT; goto out; @@ -873,7 +867,7 @@ static int kimage_load_crash_segment(struct kimage *image, goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); - ptr = kmap(page); + ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); @@ -889,7 +883,7 @@ static int kimage_load_crash_segment(struct kimage *image, else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); - kunmap(page); + kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; @@ -959,7 +953,7 @@ late_initcall(kexec_core_sysctl_init); */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -967,7 +961,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... 
*/ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; @@ -976,7 +970,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -1004,14 +998,17 @@ void crash_kexec(struct pt_regs *regs) } } -size_t crash_get_memory_size(void) +ssize_t crash_get_memory_size(void) { - size_t size = 0; + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; - mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); + + kexec_unlock(); return size; } @@ -1022,7 +1019,8 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - mutex_lock(&kexec_mutex); + if (!kexec_trylock()) + return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; @@ -1060,7 +1058,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res); unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } @@ -1132,7 +1130,7 @@ int kernel_kexec(void) { int error = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1208,6 +1206,6 @@ int kernel_kexec(void) #endif Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1d546dc97c50..dd5983010b7b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, image = NULL; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; dest_image = &kexec_image; @@ -411,7 +411,7 @@ out: if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } @@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, { int i, j; unsigned long long start, end, p_start, p_end; - struct crash_mem_range temp_range = {0, 0}; + struct range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d..74da1409cd14 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). 
+ */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +} #ifdef CONFIG_KEXEC_FILE #include <linux/purgatory.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ca9d834d0b84..1c18ecf9f98b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1607,9 +1607,10 @@ int register_kprobe(struct kprobe *p) struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; + bool on_func_entry; /* Adjust probe address from symbol */ - addr = kprobe_addr(p); + addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); p->addr = addr; @@ -1629,6 +1630,9 @@ int register_kprobe(struct kprobe *p) mutex_lock(&kprobe_mutex); + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; + old_p = get_kprobe(p->addr); if (old_p) { /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ @@ -1762,7 +1766,13 @@ static int __unregister_kprobe_top(struct kprobe *p) if ((list_p != p) && (list_p->post_handler)) goto noclean; } - ap->post_handler = NULL; + /* + * For the kprobe-on-ftrace case, we keep the + * post_handler setting to identify this aggrprobe + * armed with kprobe_ipmodify_ops. + */ + if (!kprobe_ftrace(ap)) + ap->post_handler = NULL; } noclean: /* @@ -2203,13 +2213,9 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.post_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ - if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPTION + if (rp->maxactive <= 0) rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); -#else - rp->maxactive = num_possible_cpus(); -#endif - } + #ifdef CONFIG_KRETPROBE_ON_RETHOOK rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); if (!rp->rh) @@ -2354,6 +2360,14 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace, because ftrace framework is still available at + * 'MODULE_STATE_GOING' notification. + */ + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) + disarm_kprobe_ftrace(p); + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2370,14 +2384,6 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); - - /* - * The module is going away. We should disarm the kprobe which - * is using ftrace, because ftrace framework is still available at - * 'MODULE_STATE_GOING' notification. 
- */ - if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) - disarm_kprobe_ftrace(p); } /* Disable one kprobe */ @@ -2425,8 +2431,11 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; ret = arm_kprobe(p); - if (ret) + if (ret) { p->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + kp->flags |= KPROBE_FLAG_DISABLED; + } } out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b1292a57c2a5..2df00b789b90 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -6,6 +6,7 @@ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> */ +#include <asm/byteorder.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> @@ -20,6 +21,14 @@ #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ +#if defined(__LITTLE_ENDIAN) +#define CPU_BYTEORDER_STRING "little" +#elif defined(__BIG_ENDIAN) +#define CPU_BYTEORDER_STRING "big" +#else +#error Unknown byteorder +#endif + #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -34,6 +43,14 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); +/* cpu byteorder */ +static ssize_t cpu_byteorder_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING); +} +KERNEL_ATTR_RO(cpu_byteorder); + #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, @@ -105,7 +122,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%zu\n", crash_get_memory_size()); + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sprintf(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -210,6 +232,7 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, + &cpu_byteorder_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 3c677918d8f2..f97fd01a2932 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -704,6 +704,7 @@ int kthread_stop(struct task_struct *k) kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); + set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; @@ -1050,8 +1051,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_FUNCTION_MISMATCH(timer->function, - kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. 
This is for diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 76166df011a4..781249098cb6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -112,7 +112,7 @@ static void __sched account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) { - int firstnonnull = MAXLR + 1; + int firstnonnull = MAXLR; int i; /* skip kernel threads for now */ @@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk, } i = firstnonnull; - if (i >= MAXLR - 1) + if (i >= MAXLR) return; /* Allocted a new one: */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc475e62279d..201f0c0482fb 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -125,20 +125,10 @@ struct klp_find_arg { unsigned long pos; }; -static int klp_find_callback(void *data, const char *name, - struct module *mod, unsigned long addr) +static int klp_match_callback(void *data, unsigned long addr) { struct klp_find_arg *args = data; - if ((mod && !args->objname) || (!mod && args->objname)) - return 0; - - if (strcmp(args->name, name)) - return 0; - - if (args->objname && strcmp(args->objname, mod->name)) - return 0; - args->addr = addr; args->count++; @@ -153,6 +143,23 @@ static int klp_find_callback(void *data, const char *name, return 0; } +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + return klp_match_callback(data, addr); +} + static int klp_find_object_symbol(const char *objname, const char *name, unsigned long sympos, unsigned long *addr) { @@ -167,7 +174,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (objname) module_kallsyms_on_each_symbol(klp_find_callback, &args); else - kallsyms_on_each_symbol(klp_find_callback, &args); + kallsyms_on_each_match_symbol(klp_match_callback, name, &args); /* * Ensure an address was found. If sympos is 0, ensure symbol is unique; @@ -213,7 +220,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * we use the smallest/strictest upper bound possible (56, based on * the current definition of MODULE_NAME_LEN) to prevent overflows. 
*/ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512); relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ @@ -227,7 +234,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%127[^,],%lu", + ".klp.sym.%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -325,6 +332,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * /sys/kernel/livepatch/<patch>/transition * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> + * /sys/kernel/livepatch/<patch>/<object>/patched * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static int __klp_disable_patch(struct klp_patch *patch); @@ -431,6 +439,22 @@ static struct attribute *klp_patch_attrs[] = { }; ATTRIBUTE_GROUPS(klp_patch); +static ssize_t patched_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + return sysfs_emit(buf, "%d\n", obj->patched); +} + +static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched); +static struct attribute *klp_object_attrs[] = { + &patched_kobj_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(klp_object); + static void klp_free_object_dynamic(struct klp_object *obj) { kfree(obj->name); @@ -576,6 +600,7 @@ static void klp_kobj_release_object(struct kobject *kobj) static struct kobj_type klp_ktype_object = { .release = klp_kobj_release_object, .sysfs_ops = &kobj_sysfs_ops, + .default_groups = klp_object_groups, }; static void klp_kobj_release_func(struct kobject *kobj) @@ -1171,7 +1196,7 @@ int klp_module_coming(struct module *mod) return -EINVAL; if (!strcmp(mod->name, "vmlinux")) { - pr_err("vmlinux.ko: invalid module name"); + pr_err("vmlinux.ko: invalid module name\n"); return -EINVAL; } diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 4c4f5a776d80..4152c71507e2 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); + ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 5d03a2ad1066..f1b25ec581e0 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -196,36 +196,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, struct klp_ops *ops; int i; - for (i = 0; i < nr_entries; i++) { - address = entries[i]; + if (klp_target_state == KLP_UNPATCHED) { + /* + * Check for the to-be-unpatched function + * (the func itself). + */ + func_addr = (unsigned long)func->new_func; + func_size = func->new_size; + } else { + /* + * Check for the to-be-patched function + * (the previous func). + */ + ops = klp_find_ops(func->old_func); - if (klp_target_state == KLP_UNPATCHED) { - /* - * Check for the to-be-unpatched function - * (the func itself). 
- */ - func_addr = (unsigned long)func->new_func; - func_size = func->new_size; + if (list_is_singular(&ops->func_stack)) { + /* original function */ + func_addr = (unsigned long)func->old_func; + func_size = func->old_size; } else { - /* - * Check for the to-be-patched function - * (the previous func). - */ - ops = klp_find_ops(func->old_func); - - if (list_is_singular(&ops->func_stack)) { - /* original function */ - func_addr = (unsigned long)func->old_func; - func_size = func->old_size; - } else { - /* previously patched function */ - struct klp_func *prev; - - prev = list_next_entry(func, stack_node); - func_addr = (unsigned long)prev->new_func; - func_size = prev->new_size; - } + /* previously patched function */ + struct klp_func *prev; + + prev = list_next_entry(func, stack_node); + func_addr = (unsigned long)prev->new_func; + func_size = prev->new_size; } + } + + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (address >= func_addr && address < func_addr + func_size) return -EAGAIN; @@ -610,9 +610,23 @@ void klp_reverse_transition(void) /* Called from copy_process() during fork */ void klp_copy_process(struct task_struct *child) { - child->patch_state = current->patch_state; - /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ + /* + * The parent process may have gone through a KLP transition since + * the thread flag was copied in setup_thread_stack earlier. Bring + * the task flag up to date with the parent here. + * + * The operation is serialized against all klp_*_transition() + * operations by the tasklist_lock. The only exception is + * klp_update_patch_state(current), but we cannot race with + * that because we are current. + */ + if (test_tsk_thread_flag(current, TIF_PATCH_PENDING)) + set_tsk_thread_flag(child, TIF_PATCH_PENDING); + else + clear_tsk_thread_flag(child, TIF_PATCH_PENDING); + + child->patch_state = current->patch_state; } /* diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f38..0db4093d17b8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,7 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 64a13eb56078..e3375bc40dad 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -934,8 +934,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name && - lock->key != &__lockdep_no_validate__); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 5fe4c5495ba3..185bd1c906b0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. 
If this sum is zero, then it is stable due to the fact that if any diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2e1600906c9f..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -18,7 +18,7 @@ * queued_read_lock_slowpath - acquire read lock of a queued rwlock * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -63,7 +63,7 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); * queued_write_lock_slowpath - acquire write lock of a queued rwlock * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 65a9a10caa6f..2b23378775fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -313,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; u32 old, tail; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e84d21aa0722..6afc249ce697 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -489,7 +489,7 @@ gotlock: * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct pv_node *node; @@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #include <asm/qspinlock_paravirt.h> #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { u8 locked; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 65f0262f635e..44873594de03 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -133,14 +133,19 @@ * the owner value concurrently without lock. Read from owner, however, * may not need READ_ONCE() as long as the pointer value is only used * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. 
*/ static inline void rwsem_set_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, 0); } @@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; + bool ret = false; + preempt_disable(); if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - return true; + ret = true; } - return false; + preempt_enable(); + return ret; } /* @@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + preempt_enable(); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index f2654d2fe43a..34bfae72f295 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -51,7 +51,7 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; @@ -74,7 +74,7 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -101,7 +101,7 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -131,7 +131,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -156,7 +156,7 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -180,7 +180,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). 
*/ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7f49baaa4979..8475a0794f8c 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } @@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh); #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc 
_raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } @@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -353,7 +353,7 @@ 
EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 353004155d65..29dc253d03af 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -399,7 +399,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); + r = get_random_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks); int err; do { diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 26ea5d04f56c..424b3bc58f3f 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -221,9 +221,10 @@ endchoice config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ + select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD help Support for decompressing kernel modules by the kernel itself diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 4d0bcb3d9e44..bb79ac1a6d8f 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info) return page; } -#ifdef CONFIG_MODULE_COMPRESS_GZIP +#if defined(CONFIG_MODULE_COMPRESS_GZIP) #include <linux/zlib.h> #define MODULE_COMPRESSION gzip #define MODULE_DECOMPRESS_FN module_gzip_decompress @@ -114,8 +114,8 @@ static ssize_t module_gzip_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; + if (IS_ERR(page)) { + retval = PTR_ERR(page); goto out_inflate_end; } @@ -141,7 +141,7 @@ out: kfree(s.workspace); return retval; } -#elif CONFIG_MODULE_COMPRESS_XZ +#elif defined(CONFIG_MODULE_COMPRESS_XZ) #include <linux/xz.h> #define MODULE_COMPRESSION xz #define MODULE_DECOMPRESS_FN module_xz_decompress @@ -173,8 +173,8 @@ static ssize_t module_xz_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; + if (IS_ERR(page)) { + retval = PTR_ERR(page); goto out; } @@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info, xz_dec_end(xz_dec); return retval; } +#elif defined(CONFIG_MODULE_COMPRESS_ZSTD) +#include <linux/zstd.h> +#define MODULE_COMPRESSION zstd +#define MODULE_DECOMPRESS_FN module_zstd_decompress + +static ssize_t module_zstd_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd }; + ZSTD_outBuffer zstd_dec; + ZSTD_inBuffer zstd_buf; + zstd_frame_header header; + size_t wksp_size; + void *wksp 
= NULL; + ZSTD_DStream *dstream; + size_t ret; + size_t new_size = 0; + int retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not a zstd compressed module\n"); + return -EINVAL; + } + + zstd_buf.src = buf; + zstd_buf.pos = 0; + zstd_buf.size = size; + + ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size); + if (ret != 0) { + pr_err("ZSTD-compressed data has an incomplete frame header\n"); + retval = -EINVAL; + goto out; + } + if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) { + pr_err("ZSTD-compressed data has too large a window size\n"); + retval = -EINVAL; + goto out; + } + + wksp_size = zstd_dstream_workspace_bound(header.windowSize); + wksp = kmalloc(wksp_size, GFP_KERNEL); + if (!wksp) { + retval = -ENOMEM; + goto out; + } + + dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size); + if (!dstream) { + pr_err("Can't initialize ZSTD stream\n"); + retval = -ENOMEM; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + + if (IS_ERR(page)) { + retval = PTR_ERR(page); + goto out; + } + + zstd_dec.dst = kmap_local_page(page); + zstd_dec.pos = 0; + zstd_dec.size = PAGE_SIZE; + + ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); + kunmap_local(zstd_dec.dst); + retval = zstd_get_error_code(ret); + if (retval) + break; + + new_size += zstd_dec.pos; + } while (zstd_dec.pos == PAGE_SIZE && ret != 0); + + if (retval) { + pr_err("ZSTD-decompression failed with status %d\n", retval); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + kfree(wksp); + return retval; +} #else #error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" #endif @@ -256,7 +344,7 @@ void module_decompress_cleanup(struct load_info *info) static ssize_t compression_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); + return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n"); } static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 680d980a4fb2..2e2bf236f558 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -53,6 +53,7 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const s32 __start___kcrctab[]; extern const s32 __start___kcrctab_gpl[]; +#include <linux/dynamic_debug.h> struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ @@ -62,8 +63,7 @@ struct load_info { Elf_Shdr *sechdrs; char *secstrings, *strtab; unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug *debug; - unsigned int num_debug; + struct _ddebug_info dyndbg; bool sig_ok; #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index f5c5c9175333..4523f99b0358 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -494,7 +494,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) return ret; } -#ifdef CONFIG_LIVEPATCH int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) @@ -531,4 +530,3 @@ out: mutex_unlock(&module_mutex); return ret; } -#endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/module/main.c b/kernel/module/main.c index a4e4d84b6f4e..48568a0f5651 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -53,6 +53,7 @@ #include <linux/bsearch.h> #include <linux/dynamic_debug.h>
#include <linux/audit.h> +#include <linux/cfi.h> #include <uapi/linux/module.h> #include "internal.h" @@ -84,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = { }; #endif -#define module_addr_min mod_tree.addr_min -#define module_addr_max mod_tree.addr_max - struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; @@ -1144,8 +1142,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static void cfi_cleanup(struct module *mod); - /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1190,9 +1186,6 @@ static void free_module(struct module *mod) mod->name); mutex_unlock(&module_mutex); - /* Clean up CFI for the module. */ - cfi_cleanup(mod); - /* This may be empty, but that's OK */ module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); @@ -1598,16 +1591,16 @@ static void free_modinfo(struct module *mod) } } -static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) +static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg) { - if (!debug) + if (!dyndbg->num_descs) return; - ddebug_add_module(debug, num, mod->name); + ddebug_add_module(dyndbg, mod->name); } -static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) +static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg) { - if (debug) + if (dyndbg->num_descs) ddebug_remove_module(mod->name); } @@ -1678,6 +1671,11 @@ static int elf_validity_check(struct load_info *info) info->hdr->e_machine); goto no_exec; } + if (!module_elf_check_arch(info->hdr)) { + pr_err("Invalid module architecture in ELF header: %u\n", + info->hdr->e_machine); + goto no_exec; + } if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) { pr_err("Invalid ELF section header size\n"); goto no_exec; @@ -2111,8 +2109,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__dyndbg", - sizeof(*info->debug), &info->num_debug); + info->dyndbg.descs = section_objs(info, "__dyndbg", + sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs); + info->dyndbg.classes = section_objs(info, "__dyndbg_classes", + sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes); return 0; } @@ -2249,6 +2249,11 @@ static void flush_module_icache(const struct module *mod) (unsigned long)mod->core_layout.base + mod->core_layout.size); } +bool __weak module_elf_check_arch(Elf_Ehdr *hdr) +{ + return true; +} + int __weak module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, @@ -2602,8 +2607,9 @@ static int complete_formation(struct module *mod, struct load_info *info) if (err < 0) goto out; - /* This relies on module_mutex for list integrity. */ + /* These rely on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + module_cfi_finalize(info->hdr, info->sechdrs, mod); if (module_check_misalignment(mod)) goto out_misaligned; @@ -2665,8 +2671,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } -static void cfi_init(struct module *mod); - /* * Allocate and load the module: note that size of section 0 is always * zero, and we rely on this for optional sections. @@ -2796,9 +2800,6 @@ static int load_module(struct load_info *info, const char __user *uargs, flush_module_icache(mod); - /* Setup CFI for the module. 
*/ - cfi_init(mod); - /* Now copy in args */ mod->args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(mod->args)) { @@ -2807,7 +2808,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } init_build_id(mod, info); - dynamic_debug_setup(mod, info->debug, info->num_debug); + dynamic_debug_setup(mod, &info->dyndbg); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -2871,11 +2872,10 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); - dynamic_debug_remove(mod, info->debug); + dynamic_debug_remove(mod, &info->dyndbg); synchronize_rcu(); kfree(mod->args); free_arch_cleanup: - cfi_cleanup(mod); module_arch_cleanup(mod); free_modinfo: free_modinfo(mod); @@ -2961,41 +2961,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size) return ((void *)addr >= start && (void *)addr < start + size); } -static void cfi_init(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - initcall_t *init; -#ifdef CONFIG_MODULE_UNLOAD - exitcall_t *exit; -#endif - - rcu_read_lock_sched(); - mod->cfi_check = (cfi_check_fn) - find_kallsyms_symbol_value(mod, "__cfi_check"); - init = (initcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - /* Fix init/exit functions to point to the CFI jump table */ - if (init) - mod->init = *init; -#ifdef CONFIG_MODULE_UNLOAD - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - if (exit) - mod->exit = *exit; -#endif - rcu_read_unlock_sched(); - - cfi_module_add(mod, mod_tree.addr_min); -#endif -} - -static void cfi_cleanup(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, mod_tree.addr_min); -#endif -} - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! 
*/ char *module_flags(struct module *mod, char *buf, bool show_state) { diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index ce68f821dcd1..c921bf044050 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod) int err; struct kobject *kobj; - if (!module_sysfs_initialized) { + if (!module_kset) { pr_err("%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 7f8133044d09..26d812e07615 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -10,6 +10,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/debugfs.h> #include <linux/rculist.h> #include "internal.h" @@ -21,6 +22,9 @@ int try_add_tainted_module(struct module *mod) module_assert_mutex_or_preempt(); + if (!mod->taints) + goto out; + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list, lockdep_is_held(&module_mutex)) { if (!strcmp(mod_taint->name, mod->name) && @@ -59,3 +63,70 @@ void print_unloaded_tainted_modules(void) } } } + +#ifdef CONFIG_DEBUG_FS +static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_rcu(&unloaded_tainted_modules, *pos); +} + +static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next_rcu(p, &unloaded_tainted_modules, pos); +} + +static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p) +{ + struct mod_unload_taint *mod_taint; + char buf[MODULE_FLAGS_BUF_SIZE]; + size_t l; + + mod_taint = list_entry(p, struct mod_unload_taint, list); + l = module_flags_taint(mod_taint->taints, buf); + buf[l++] = '\0'; + + seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count); + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations unloaded_tainted_modules_seq_ops = { + .start = unloaded_tainted_modules_seq_start, + .next = unloaded_tainted_modules_seq_next, + .stop = unloaded_tainted_modules_seq_stop, + .show = unloaded_tainted_modules_seq_show, +}; + +static int unloaded_tainted_modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unloaded_tainted_modules_seq_ops); +} + +static const struct file_operations unloaded_tainted_modules_fops = { + .open = unloaded_tainted_modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init unloaded_tainted_modules_init(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("modules", NULL); + debugfs_create_file("unloaded_tainted", 0444, dir, NULL, + &unloaded_tainted_modules_fops); + + return 0; +} +module_init(unloaded_tainted_modules_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/notifier.c b/kernel/notifier.c index 0d5bd62c480e..ab75637fd904 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -62,7 +62,7 @@ static int notifier_chain_unregister(struct notifier_block **nl, * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. - * @returns: notifier_call_chain returns the value returned by the + * Return: notifier_call_chain returns the value returned by the * last notifier function called. 
*/ static int notifier_call_chain(struct notifier_block **nl, @@ -105,13 +105,13 @@ NOKPROBE_SYMBOL(notifier_call_chain); * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up - * @v Pointer passed unmodified to the notifier function + * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * - * Returns: the return value of the @val_up call. + * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b4cbb406bc28..a487ff24129b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)))) { - if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + if ((flags & CLONE_VM) || + likely(old_ns->time_ns_for_children == old_ns->time_ns)) { get_nsproxy(old_ns); return 0; } @@ -255,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +int exec_task_namespaces(void) +{ + struct task_struct *tsk = current; + struct nsproxy *new; + + if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns) + return 0; + + new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); + if (IS_ERR(new)) + return PTR_ERR(new); + + timens_on_fork(new, tsk); + switch_task_namespaces(tsk, new); + return 0; +} + static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | diff --git a/kernel/padata.c b/kernel/padata.c index e5819bb8bd1d..e007b8a4b738 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -83,8 +83,16 @@ static struct padata_work *padata_work_alloc(void) return pw; } -static void padata_work_init(struct padata_work *pw, work_func_t work_fn, - void *data, int flags) +/* + * This function is marked __ref because this function may be optimized in such + * a way that it directly refers to work_fn's address, which causes modpost to + * complain when work_fn is marked __init. This scenario was observed with clang + * LTO, where padata_work_init() was optimized to refer directly to + * padata_mt_helper() because the calls to padata_work_init() with other work_fn + * values were eliminated or inlined. + */ +static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data, int flags) { if (flags & PADATA_WORK_ONSTACK) INIT_WORK_ONSTACK(&pw->pw_work, work_fn); @@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + if (!pw) { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + rcu_read_unlock_bh(); if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); - } else { - /* Maximum works limit exceeded, run in the current task. 
*/ - padata->parallel(padata); } return 0; @@ -388,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata) int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; + struct list_head *pos; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &reorder->list, list) + list_for_each_prev(pos, &reorder->list) { + cur = list_entry(pos, struct padata_priv, list); if (cur->seq_nr < padata->seq_nr) break; - list_add(&padata->list, &cur->list); + } + list_add(&padata->list, pos); spin_unlock(&reorder->lock); /* diff --git a/kernel/panic.c b/kernel/panic.c index c6eb8f8db0c0..463c9295bc28 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -25,6 +25,7 @@ #include <linux/kexec.h> #include <linux/panic_notifier.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> @@ -32,6 +33,7 @@ #include <linux/bug.h> #include <linux/ratelimit.h> #include <linux/debugfs.h> +#include <linux/sysfs.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -58,6 +60,7 @@ bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; +static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -75,8 +78,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +#ifdef CONFIG_SYSCTL static struct ctl_table kern_panic_table[] = { +#ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, @@ -86,6 +90,14 @@ static struct ctl_table kern_panic_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif + { + .procname = "warn_limit", + .data = &warn_limit, + .maxlen = sizeof(warn_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -97,6 +109,25 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +static atomic_t warn_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); +} + +static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); + +static __init int kernel_panic_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_panic_sysfs_init); +#endif + static long no_blink(int state) { return 0; @@ -199,6 +230,19 @@ static void panic_print_sys_info(bool console_flush) ftrace_dump(DUMP_ALL); } +void check_panic_on_warn(const char *origin) +{ + unsigned int limit; + + if (panic_on_warn) + panic("%s: panic_on_warn set ...\n", origin); + + limit = READ_ONCE(warn_limit); + if (atomic_inc_return(&warn_count) >= limit && limit) + panic("%s: system warned too often (kernel.warn_limit is %d)", + origin, limit); +} + /** * panic - halt the system * @fmt: The text string to print @@ -329,9 +373,6 @@ void panic(const char *fmt, ...) 
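(Editor's aside, not part of the diff: the kernel/panic.c hunk above introduces check_panic_on_warn() and the kernel.warn_limit sysctl. A minimal sketch of how another warning path could use the helper; the function name and "example" origin string below are invented for illustration.)

#include <linux/kernel.h>
#include <linux/panic.h>
#include <linux/printk.h>

/* Hypothetical caller: report a recoverable problem, then let the new
 * helper decide whether panic_on_warn or kernel.warn_limit requires a
 * panic for this origin. */
static void example_subsystem_warn(void)
{
	pr_warn("example: unexpected state, continuing\n");
	check_panic_on_warn("example");
}
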
if (_crash_kexec_post_notifiers) __crash_kexec(NULL); -#ifdef CONFIG_VT - unblank_screen(); -#endif console_unblank(); /* @@ -620,8 +661,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("kernel"); if (!regs) dump_stack(); @@ -747,8 +787,8 @@ static int __init panic_on_taint_setup(char *s) if (s && !strcmp(s, "nousertaint")) panic_on_taint_nousertaint = true; - pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", - panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n", + panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint)); return 0; } diff --git a/kernel/params.c b/kernel/params.c index 5b92310425c5..14d66070757b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -static int uevent_filter(struct kobject *kobj) +static int uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); @@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = { }; struct kset *module_kset; -int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { @@ -954,7 +953,11 @@ struct kobj_type module_ktype = { }; /* - * param_sysfs_init - wrapper for built-in params support + * param_sysfs_init - create "module" kset + * + * This must be done before the initramfs is unpacked and + * request_module() thus becomes possible, because otherwise the + * module load would fail in mod_sysfs_init. */ static int __init param_sysfs_init(void) { @@ -964,13 +967,25 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__); return -ENOMEM; } - module_sysfs_initialized = 1; + + return 0; +} +subsys_initcall(param_sysfs_init); + +/* + * param_sysfs_builtin_init - add sysfs version and parameter + * attributes for built-in modules + */ +static int __init param_sysfs_builtin_init(void) +{ + if (!module_kset) + return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } -subsys_initcall(param_sysfs_init); +late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ diff --git a/kernel/pid.c b/kernel/pid.c index 2fc0a16ec77b..3fbc5e46b721 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } +EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 89c71fce225d..793c55a2becb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -92,20 +92,24 @@ bool hibernation_available(void) */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { + unsigned int sleep_flags; + if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } - lock_system_sleep(); + + sleep_flags = lock_system_sleep(); + hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); @@ -641,7 +645,7 @@ static void power_down(void) int error; if (hibernation_mode == 
HIBERNATION_SUSPEND) { - error = suspend_devices_and_enter(PM_SUSPEND_MEM); + error = suspend_devices_and_enter(mem_sleep_current); if (error) { hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : @@ -713,6 +717,7 @@ static int load_image_and_restore(void) int hibernate(void) { bool snapshot_test = false; + unsigned int sleep_flags; int error; if (!hibernation_available()) { @@ -720,7 +725,7 @@ int hibernate(void) return -EPERM; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; @@ -794,7 +799,7 @@ int hibernate(void) pm_restore_console(); hibernate_release(); Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; @@ -809,9 +814,10 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { + unsigned int sleep_flags; int error; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -891,7 +897,7 @@ restore: hibernate_release(); unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -1100,11 +1106,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; int error = 0; - int i; int len; char *p; - int mode = HIBERNATION_INVALID; + int i; if (!hibernation_available()) return -EPERM; @@ -1112,7 +1119,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -1142,7 +1149,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? 
error : n; } @@ -1158,9 +1165,10 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - dev_t res; + unsigned int sleep_flags; int len = n; char *name; + dev_t res; if (len && buf[len-1] == '\n') len--; @@ -1173,9 +1181,10 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!res) return -EINVAL; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_resume_device = res; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index e3694034b753..31ec4a9b9d70 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,14 +21,16 @@ #ifdef CONFIG_PM_SLEEP -void lock_system_sleep(void) +unsigned int lock_system_sleep(void) { - current->flags |= PF_FREEZER_SKIP; + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); + return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); -void unlock_system_sleep(void) +void unlock_system_sleep(unsigned int flags) { /* * Don't use freezer_count() because we don't want the call to @@ -46,7 +48,8 @@ void unlock_system_sleep(void) * Which means, if we use try_to_freeze() here, it would make them * enter the refrigerator, thus causing hibernation to lockup. */ - current->flags &= ~PF_FREEZER_SKIP; + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); @@ -263,16 +266,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + unsigned int sleep_flags; const char * const *s; + int error = -EINVAL; int level; char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -282,7 +286,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 3068601e585a..6c1c7e566d35 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -27,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { + const char *what = user_only ? 
"user space processes" : + "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; @@ -36,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only) bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; + pr_info("Freezing %s\n", what); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -50,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; + todo++; } read_unlock(&tasklist_lock); @@ -83,9 +86,8 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = ktime_to_ms(elapsed); if (todo) { - pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", + pr_err("Freezing %s %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); @@ -96,15 +98,14 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); } } else { - pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, - elapsed_msecs % 1000); + pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", + what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; @@ -129,17 +130,14 @@ int freeze_processes(void) current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); pm_wakeup_clear(0); - pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); - if (!error) { + if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); - pr_cont("done."); - } - pr_cont("\n"); + BUG_ON(in_atomic()); /* @@ -168,14 +166,9 @@ int freeze_kernel_threads(void) { int error; - pr_info("Freezing remaining freezable tasks ... "); - pm_nosig_freezing = true; error = try_to_freeze_tasks(false); - if (!error) - pr_cont("done."); - pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -190,7 +183,7 @@ void thaw_processes(void) trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2a406753af90..cd8b7b35f1e8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1723,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable) * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 - * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2 + * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. 
* @@ -2259,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) + if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) { memory_bm_set_bit(bm, buf[j]); - else + } else { + if (!pfn_valid(buf[j])) + pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", + (unsigned long long)PFN_PHYS(buf[j])); return -EFAULT; + } } return 0; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 827075944d28..fa3bf161d13f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); s2idle_ops = ops; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } static void s2idle_begin(void) @@ -136,6 +138,9 @@ static void s2idle_loop(void) break; } + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); + s2idle_enter(); } @@ -200,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); suspend_ops = ops; @@ -216,7 +223,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) mem_sleep_current = PM_SUSPEND_MEM; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(suspend_set_ops); diff --git a/kernel/power/user.c b/kernel/power/user.c index d43c2aa583b2..3a4e70366f35 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev) static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -98,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->dev = 0; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -106,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; @@ -124,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return 0; } @@ -132,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp) static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned int sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -157,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -165,16 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, static ssize_t snapshot_write(struct file *filp, 
const char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned long sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; if (need_wait) { wait_for_device_probe(); need_wait = false; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; @@ -196,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1a81fd9889b..7decf1e9c486 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -79,13 +79,20 @@ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console - * driver system. + * console_mutex protects console_list updates and console->flags updates. + * The flags are synchronized only for consoles that are registered, i.e. + * accessible via the console list. + */ +static DEFINE_MUTEX(console_mutex); + +/* + * console_sem protects updates to console->seq and console_suspended, + * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem); -struct console *console_drivers; -EXPORT_SYMBOL_GPL(console_drivers); +HLIST_HEAD(console_list); +EXPORT_SYMBOL_GPL(console_list); +DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain @@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk; static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; + +void lockdep_assert_console_list_lock_held(void) +{ + lockdep_assert_held(&console_mutex); +} +EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +bool console_srcu_read_lock_is_held(void) +{ + return srcu_read_lock_held(&console_srcu); +} #endif enum devkmsg_log_bits { @@ -220,8 +240,68 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ -/* Number of registered extended console drivers. */ -static int nr_ext_console_drivers; +/** + * console_list_lock - Lock the console list + * + * For console list or console->flags updates + */ +void console_list_lock(void) +{ + /* + * In unregister_console() and console_force_preferred_locked(), + * synchronize_srcu() is called with the console_list_lock held. + * Therefore it is not allowed that the console_list_lock is taken + * with the srcu_lock held. + * + * Detecting if this context is really in the read-side critical + * section is only possible if the appropriate debug options are + * enabled. + */ + WARN_ON_ONCE(debug_lockdep_rcu_enabled() && + srcu_read_lock_held(&console_srcu)); + + mutex_lock(&console_mutex); +} +EXPORT_SYMBOL(console_list_lock); + +/** + * console_list_unlock - Unlock the console list + * + * Counterpart to console_list_lock() + */ +void console_list_unlock(void) +{ + mutex_unlock(&console_mutex); +} +EXPORT_SYMBOL(console_list_unlock); + +/** + * console_srcu_read_lock - Register a new reader for the + * SRCU-protected console list + * + * Use for_each_console_srcu() to iterate the console list + * + * Context: Any context. + * Return: A cookie to pass to console_srcu_read_unlock(). 
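(Editor's aside: a hedged sketch of the reader pattern this kernel-doc describes, using only the console_srcu_read_lock()/unlock(), console_srcu_read_flags() and for_each_console_srcu() helpers added by this series; the counting helper itself is invented.)

#include <linux/console.h>

/* Hypothetical reader: count currently enabled consoles without taking
 * the console_lock, using the SRCU read-lock cookie. */
static int example_count_enabled_consoles(void)
{
	struct console *con;
	int count = 0;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		if (console_srcu_read_flags(con) & CON_ENABLED)
			count++;
	}
	console_srcu_read_unlock(cookie);

	return count;
}
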
+ */ +int console_srcu_read_lock(void) +{ + return srcu_read_lock_nmisafe(&console_srcu); +} +EXPORT_SYMBOL(console_srcu_read_lock); + +/** + * console_srcu_read_unlock - Unregister an old reader from + * the SRCU-protected console list + * @cookie: cookie returned from console_srcu_read_lock() + * + * Counterpart to console_srcu_read_lock() + */ +void console_srcu_read_unlock(int cookie) +{ + srcu_read_unlock_nmisafe(&console_srcu, cookie); +} +EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use @@ -433,7 +513,7 @@ static struct printk_ringbuffer *prb = &printk_rb_static; * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ -static bool __printk_percpu_data_ready __read_mostly; +static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { @@ -1817,13 +1897,13 @@ static void console_lock_spinning_enable(void) * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * - * Important: Callers lose the lock if there was a busy waiter. - * They must not touch items synchronized by console_lock - * in this case. + * Important: Callers lose both the console_lock and the SRCU read lock if + * there was a busy waiter. They must not touch items synchronized by + * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ -static int console_lock_spinning_disable_and_check(void) +static int console_lock_spinning_disable_and_check(int cookie) { int waiter; @@ -1843,6 +1923,12 @@ static int console_lock_spinning_disable_and_check(void) spin_release(&console_owner_dep_map, _THIS_IP_); /* + * Preserve lockdep lock ordering. Release the SRCU read lock before + * releasing the console_lock. + */ + console_srcu_read_unlock(cookie); + + /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ @@ -2296,6 +2382,7 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2324,12 +2411,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(void) { return 0; } +static int console_lock_spinning_disable_and_check(int cookie) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, char *dropped_text) { } static bool suppress_message_printing(int level) { return false; } +static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2392,7 +2480,7 @@ static int __add_preferred_console(char *name, int idx, char *options, return -E2BIG; if (!brl_options) preferred_console = i; - strlcpy(c->name, name, sizeof(c->name)); + strscpy(c->name, name, sizeof(c->name)); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); @@ -2554,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu) } /** - * console_lock - lock the console system for exclusive use. 
+ * console_lock - block the console subsystem from printing * - * Acquires a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Acquires a lock which guarantees that no consoles will + * be in or enter their write() callback. * * Can sleep, returns nothing. */ @@ -2574,10 +2662,10 @@ void console_lock(void) EXPORT_SYMBOL(console_lock); /** - * console_trylock - try to lock the console system for exclusive use. + * console_trylock - try to block the console subsystem from printing * - * Try to acquire a lock which guarantees that the caller has exclusive - * access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that no consoles will + * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2624,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void) * Check if the given console is currently capable and allowed to print * records. * - * Requires the console_lock. + * Requires the console_srcu_read_lock. */ static inline bool console_is_usable(struct console *con) { - if (!(con->flags & CON_ENABLED)) + short flags = console_srcu_read_flags(con); + + if (!(flags & CON_ENABLED)) return false; if (!con->write) @@ -2639,8 +2729,7 @@ static inline bool console_is_usable(struct console *con) * allocated. So unless they're explicitly marked as being able to * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ - if (!cpu_online(raw_smp_processor_id()) && - !(con->flags & CON_ANYTIME)) + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) return false; return true; @@ -2665,16 +2754,18 @@ static void __console_unlock(void) * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. * * @handover will be set to true if a printk waiter has taken over the - * console_lock, in which case the caller is no longer holding the - * console_lock. Otherwise it is set to false. + * console_lock, in which case the caller is no longer holding both the + * console_lock and the SRCU read lock. Otherwise it is set to false. + * + * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * - * Requires the console_lock. + * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) + char *dropped_text, bool *handover, int cookie) { static int panic_console_dropped; struct printk_info info; @@ -2734,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ con->seq++; - *handover = console_lock_spinning_disable_and_check(); + *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); skip: return true; @@ -2771,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove bool any_usable = false; struct console *con; bool any_progress; + int cookie; *next_seq = 0; *handover = false; @@ -2778,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove do { any_progress = false; - for_each_console(con) { + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { bool progress; if (!console_is_usable(con)) continue; any_usable = true; - if (con->flags & CON_EXTENDED) { + if (console_srcu_read_flags(con) & CON_EXTENDED) { /* Extended consoles do not print "dropped messages". 
*/ progress = console_emit_next_record(con, &text[0], &ext_text[0], NULL, - handover); + handover, cookie); } else { progress = console_emit_next_record(con, &text[0], NULL, &dropped_text[0], - handover); + handover, cookie); } + + /* + * If a handover has occurred, the SRCU read lock + * is already released. + */ if (*handover) return false; @@ -2808,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove /* Allow panic_cpu to take over the consoles safely. */ if (abandon_console_lock_in_panic()) - return false; + goto abandon; if (do_cond_resched) cond_resched(); } + console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; + +abandon: + console_srcu_read_unlock(cookie); + return false; } /** - * console_unlock - unlock the console system + * console_unlock - unblock the console subsystem from printing * - * Releases the console_lock which the caller holds on the console system - * and the console driver list. + * Releases the console_lock which the caller holds to block printing of + * the console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock(); emits @@ -2900,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { struct console *c; + int cookie; /* - * console_unblank can no longer be called in interrupt context unless - * oops_in_progress is set to 1.. + * Stop console printing because the unblank() callback may + * assume the console is not within its write() callback. + * + * If @oops_in_progress is set, this may be an atomic context. + * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { if (down_trylock_console_sem() != 0) @@ -2913,9 +3020,14 @@ void console_unblank(void) console_locked = 1; console_may_schedule = 0; - for_each_console(c) - if ((c->flags & CON_ENABLED) && c->unblank) + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) c->unblank(); + } + console_srcu_read_unlock(cookie); + console_unlock(); if (!oops_in_progress) @@ -2942,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) { struct console *c; + int cookie; u64 seq; seq = prb_first_valid_seq(prb); - for_each_console(c) + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + /* + * If the above console_trylock() failed, this is an + * unsynchronized assignment. But in that case, the + * kernel is in "hope and pray" mode anyway. + */ c->seq = seq; + } + console_srcu_read_unlock(cookie); } console_unlock(); } @@ -2958,15 +3080,25 @@ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; + int cookie; + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. 
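(Editor's aside: per the reworded kernel-doc above, console_lock() now only guarantees that no console is inside its write() callback. A hedged sketch of a driver using it to quiesce printing around a hardware reconfiguration; my_reprogram_uart() is an invented helper, not a kernel API.)

#include <linux/console.h>

static void my_reprogram_uart(void)	/* hypothetical driver helper */
{
	/* reprogram baud rate, FIFOs, etc. */
}

static void example_quiesce_consoles(void)
{
	console_lock();		/* no console ->write() can run from here on */
	my_reprogram_uart();
	console_unlock();	/* records buffered meanwhile are printed now */
}
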
+ */ console_lock(); - for_each_console(c) { + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } + console_srcu_read_unlock(cookie); + console_unlock(); return driver; } @@ -2979,17 +3111,25 @@ struct tty_driver *console_device(int *index) void console_stop(struct console *console) { __pr_flush(console, 1000, true); - console_lock(); - console->flags &= ~CON_ENABLED; - console_unlock(); + console_list_lock(); + console_srcu_write_flags(console, console->flags & ~CON_ENABLED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All contexts must + * be able to see that this console is disabled so that (for example) + * the caller can suspend the port without risk of another context + * using the port. + */ + synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { - console_lock(); - console->flags |= CON_ENABLED; - console_unlock(); + console_list_lock(); + console_srcu_write_flags(console, console->flags | CON_ENABLED); + console_list_unlock(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3082,6 +3222,72 @@ static void try_enable_default_console(struct console *newcon) (con->flags & CON_BOOT) ? "boot" : "", \ con->name, con->index, ##__VA_ARGS__) +static void console_init_seq(struct console *newcon, bool bootcon_registered) +{ + struct console *con; + bool handover; + + if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { + /* Get a consistent copy of @syslog_seq. */ + mutex_lock(&syslog_lock); + newcon->seq = syslog_seq; + mutex_unlock(&syslog_lock); + } else { + /* Begin with next message added to ringbuffer. */ + newcon->seq = prb_next_seq(prb); + + /* + * If any enabled boot consoles are due to be unregistered + * shortly, some may not be caught up and may be the same + * device as @newcon. Since it is not known which boot console + * is the same device, flush all consoles and, if necessary, + * start with the message of the enabled boot console that is + * the furthest behind. + */ + if (bootcon_registered && !keep_bootcon) { + /* + * Hold the console_lock to stop console printing and + * guarantee safe access to console->seq. + */ + console_lock(); + + /* + * Flush all consoles and set the console to start at + * the next unprinted sequence number. + */ + if (!console_flush_all(true, &newcon->seq, &handover)) { + /* + * Flushing failed. Just choose the lowest + * sequence of the enabled boot consoles. + */ + + /* + * If there was a handover, this context no + * longer holds the console_lock. 
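(Editor's aside: a hedged sketch of the usage pattern the new console_stop() comment above describes, where the synchronize_srcu() call guarantees no context still sees the console as enabled before the port is suspended; my_console and the my_port_*() helpers are invented.)

#include <linux/console.h>

static struct console my_console;	/* hypothetical, registered elsewhere */

static void my_port_suspend(void) { }	/* hypothetical driver helpers */
static void my_port_resume(void) { }

static void example_suspend_console_port(void)
{
	console_stop(&my_console);	/* returns only after synchronize_srcu() */
	my_port_suspend();
}

static void example_resume_console_port(void)
{
	my_port_resume();
	console_start(&my_console);
}
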
+ */ + if (handover) + console_lock(); + + newcon->seq = prb_next_seq(prb); + for_each_console(con) { + if ((con->flags & CON_BOOT) && + (con->flags & CON_ENABLED) && + con->seq < newcon->seq) { + newcon->seq = con->seq; + } + } + } + + console_unlock(); + } + } +} + +#define console_first() \ + hlist_entry(console_list.first, struct console, node) + +static int unregister_console_locked(struct console *console); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -3104,28 +3310,29 @@ static void try_enable_default_console(struct console *newcon) void register_console(struct console *newcon) { struct console *con; - bool bootcon_enabled = false; - bool realcon_enabled = false; + bool bootcon_registered = false; + bool realcon_registered = false; int err; + console_list_lock(); + for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", - con->name, con->index)) - return; - } + con->name, con->index)) { + goto unlock; + } - for_each_console(con) { if (con->flags & CON_BOOT) - bootcon_enabled = true; + bootcon_registered = true; else - realcon_enabled = true; + realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ - if (newcon->flags & CON_BOOT && realcon_enabled) { + if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); - return; + goto unlock; } /* @@ -3141,8 +3348,8 @@ void register_console(struct console *newcon) * flag set and will be first in the list. */ if (preferred_console < 0) { - if (!console_drivers || !console_drivers->device || - console_drivers->flags & CON_BOOT) { + if (hlist_empty(&console_list) || !console_first()->device || + console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } @@ -3156,7 +3363,7 @@ void register_console(struct console *newcon) /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) - return; + goto unlock; /* * If we have a bootconsole, and are switching to a real console, @@ -3164,42 +3371,38 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bootcon_enabled && + if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } + newcon->dropped = 0; + console_init_seq(newcon, bootcon_registered); + /* - * Put this console in the list - keep the - * preferred driver at the head of the list. + * Put this console in the list - keep the + * preferred driver at the head of the list. */ - console_lock(); - if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { - newcon->next = console_drivers; - console_drivers = newcon; - if (newcon->next) - newcon->next->flags &= ~CON_CONSDEV; - /* Ensure this flag is always set for the head of the list */ + if (hlist_empty(&console_list)) { + /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; - } else { - newcon->next = console_drivers->next; - console_drivers->next = newcon; - } + hlist_add_head_rcu(&newcon->node, &console_list); - if (newcon->flags & CON_EXTENDED) - nr_ext_console_drivers++; + } else if (newcon->flags & CON_CONSDEV) { + /* Only the new head can have CON_CONSDEV set. 
*/ + console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); + hlist_add_head_rcu(&newcon->node, &console_list); - newcon->dropped = 0; - if (newcon->flags & CON_PRINTBUFFER) { - /* Get a consistent copy of @syslog_seq. */ - mutex_lock(&syslog_lock); - newcon->seq = syslog_seq; - mutex_unlock(&syslog_lock); } else { - /* Begin with next message. */ - newcon->seq = prb_next_seq(prb); + hlist_add_behind_rcu(&newcon->node, console_list.first); } - console_unlock(); + + /* + * No need to synchronize SRCU here! The caller does not rely + * on all contexts being able to see the new console before + * register_console() completes. + */ + console_sysfs_notify(); /* @@ -3210,24 +3413,28 @@ void register_console(struct console *newcon) * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); - if (bootcon_enabled && + if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { - /* We need to iterate through all boot consoles, to make - * sure we print everything out, before we unregister them. - */ - for_each_console(con) + struct hlist_node *tmp; + + hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) - unregister_console(con); + unregister_console_locked(con); + } } +unlock: + console_list_unlock(); } EXPORT_SYMBOL(register_console); -int unregister_console(struct console *console) +/* Must be called under console_list_lock(). */ +static int unregister_console_locked(struct console *console) { - struct console *con; int res; + lockdep_assert_console_list_lock_held(); + con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); @@ -3236,51 +3443,94 @@ int unregister_console(struct console *console) if (res > 0) return 0; - res = -ENODEV; - console_lock(); - if (console_drivers == console) { - console_drivers=console->next; - res = 0; - } else { - for_each_console(con) { - if (con->next == console) { - con->next = console->next; - res = 0; - break; - } - } - } + /* Disable it unconditionally */ + console_srcu_write_flags(console, console->flags & ~CON_ENABLED); - if (res) - goto out_disable_unlock; + if (!console_is_registered_locked(console)) + return -ENODEV; - if (console->flags & CON_EXTENDED) - nr_ext_console_drivers--; + hlist_del_init_rcu(&console->node); /* + * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. + * </HISTORICAL> + * + * The above makes no sense as there is no guarantee that the next + * console has any device attached. Oh well.... */ - if (console_drivers != NULL && console->flags & CON_CONSDEV) - console_drivers->flags |= CON_CONSDEV; + if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) + console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); + + /* + * Ensure that all SRCU list walks have completed. All contexts + * must not be able to see this console in the list so that any + * exit/cleanup routines can be performed safely. 
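Editorial note, not part of the patch: the unregister path above is an instance of the usual unpublish-then-wait idiom for SRCU-protected lists. A generic sketch with hypothetical names:

	#include <linux/rculist.h>
	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct my_item {
		struct hlist_node node;
		int payload;
	};

	static HLIST_HEAD(my_list);
	DEFINE_STATIC_SRCU(my_srcu);

	static void my_item_remove(struct my_item *item)
	{
		hlist_del_init_rcu(&item->node);	/* unpublish from readers */
		synchronize_srcu(&my_srcu);		/* wait out all SRCU list walks */
		kfree(item);				/* nothing can still reference it */
	}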
+ */ + synchronize_srcu(&console_srcu); - console->flags &= ~CON_ENABLED; - console_unlock(); console_sysfs_notify(); if (console->exit) res = console->exit(console); return res; +} -out_disable_unlock: - console->flags &= ~CON_ENABLED; - console_unlock(); +int unregister_console(struct console *console) +{ + int res; + console_list_lock(); + res = unregister_console_locked(console); + console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); +/** + * console_force_preferred_locked - force a registered console preferred + * @con: The registered console to force preferred. + * + * Must be called under console_list_lock(). + */ +void console_force_preferred_locked(struct console *con) +{ + struct console *cur_pref_con; + + if (!console_is_registered_locked(con)) + return; + + cur_pref_con = console_first(); + + /* Already preferred? */ + if (cur_pref_con == con) + return; + + /* + * Delete, but do not re-initialize the entry. This allows the console + * to continue to appear registered (via any hlist_unhashed_lockless() + * checks), even though it was briefly removed from the console list. + */ + hlist_del_rcu(&con->node); + + /* + * Ensure that all SRCU list walks have completed so that the console + * can be added to the beginning of the console list and its forward + * list pointer can be re-initialized. + */ + synchronize_srcu(&console_srcu); + + con->flags |= CON_CONSDEV; + WARN_ON(!con->device); + + /* Only the new head can have CON_CONSDEV set. */ + console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); + hlist_add_head_rcu(&con->node, &console_list); +} +EXPORT_SYMBOL(console_force_preferred_locked); + /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. @@ -3327,10 +3577,12 @@ void __init console_init(void) */ static int __init printk_late_init(void) { + struct hlist_node *tmp; struct console *con; int ret; - for_each_console(con) { + console_list_lock(); + hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; @@ -3347,9 +3599,11 @@ static int __init printk_late_init(void) */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); - unregister_console(con); + unregister_console_locked(con); } } + console_list_unlock(); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); @@ -3369,6 +3623,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre struct console *c; u64 last_diff = 0; u64 printk_seq; + int cookie; u64 diff; u64 seq; @@ -3379,9 +3634,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre for (;;) { diff = 0; + /* + * Hold the console_lock to guarantee safe access to + * console->seq and to prevent changes to @console_suspended + * until all consoles have been processed. + */ console_lock(); - for_each_console(c) { + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { if (con && con != c) continue; if (!console_is_usable(c)) @@ -3390,6 +3651,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre if (printk_seq < seq) diff += seq - printk_seq; } + console_srcu_read_unlock(cookie); /* * If consoles are suspended, it cannot be expected that they @@ -3438,11 +3700,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. 
May sleep while acquiring console lock. * Return: true if all enabled printers are caught up. */ -bool pr_flush(int timeout_ms, bool reset_on_progress) +static bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } -EXPORT_SYMBOL(pr_flush); /* * Delayed printk version, for scheduler-internal messages: diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2b7b6ddab4f7..2dc4d5a1f1ff 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -203,7 +203,7 @@ * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes - * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * diff --git a/kernel/profile.c b/kernel/profile.c index 7ea01ba30e75..8a77769bc4b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,43 +59,39 @@ int profile_setup(char *str) static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; + const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel sleep profiling enabled (shift: %u)\n", - prof_shift); + select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel schedule profiling enabled (shift: %u)\n", - prof_shift); + select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel KVM profiling enabled (shift: %u)\n", - prof_shift); + select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } + + if (select) { + if (str[strlen(select)] == ',') + str += strlen(select) + 1; + if (get_option(&str, &par)) + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel %s profiling enabled (shift: %u)\n", + select, prof_shift); + } + return 1; } __setup("profile=", profile_setup); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1893d909e45c..54482193e1ed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) read_unlock(&tasklist_lock); if (!ret && !ignore_state && - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) ret = -ESRCH; return ret; diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index d471d22a5e21..ab62074174c3 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -54,27 +54,25 @@ config RCU_EXPERT Say N if you are unsure. config SRCU - bool - help - This option selects the sleepable version of RCU. 
This version - permits arbitrary sleeping or blocking within RCU read-side critical - sections. + def_bool y config TINY_SRCU bool - default y if SRCU && TINY_RCU + default y if TINY_RCU help This option selects the single-CPU non-preemptible version of SRCU. config TREE_SRCU bool - default y if SRCU && !TINY_RCU + default y if !TINY_RCU help This option selects the full-fledged version of SRCU. +config NEED_SRCU_NMI_SAFE + def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU + config TASKS_RCU_GENERIC def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU - select SRCU help This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. @@ -311,4 +309,12 @@ config TASKS_TRACE_RCU_READ_MB Say N here if you hate read-side memory barriers. Take the default if you are unsure. +config RCU_LAZY + bool "RCU callback lazy invocation functionality" + depends on RCU_NOCB_CPU + default n + help + To save power, batch RCU callbacks and flush after delay, memory + pressure, or callback list growing too big. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 1b0c41d490f0..232e29fe3e5e 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -27,7 +27,6 @@ config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST - select SRCU default n help This option provides a kernel module that runs performance @@ -43,7 +42,6 @@ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST - select SRCU default n help This option provides a kernel module that runs torture tests @@ -59,7 +57,6 @@ config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL select TORTURE_TEST - select SRCU default n help This option provides a kernel module that runs performance tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index be5979da07f5..c5aa934de59b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -286,7 +286,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) +#if !defined(CONFIG_TINY_RCU) #include <linux/rcu_node_tree.h> @@ -375,6 +375,10 @@ extern void rcu_init_geometry(void); (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) +#endif /* !defined(CONFIG_TINY_RCU) */ + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC) + /* * Wrappers for the rcu_node::lock acquire and release. * @@ -437,7 +441,7 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ +#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC) #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. 
*/ @@ -474,6 +478,14 @@ enum rcutorture_type { INVALID_RCU_FLAVOR }; +#if defined(CONFIG_RCU_LAZY) +unsigned long rcu_lazy_get_jiffies_till_flush(void); +void rcu_lazy_set_jiffies_till_flush(unsigned long j); +#else +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } +#endif + #if defined(CONFIG_TREE_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 3ef02d4a8108..91fb5905a008 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -95,6 +95,7 @@ torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); +torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); @@ -175,7 +176,7 @@ static struct rcu_scale_ops rcu_ops = { .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, - .async = call_rcu, + .async = call_rcu_hurry, .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -659,6 +660,14 @@ struct kfree_obj { struct rcu_head rh; }; +/* Used if doing RCU-kfree'ing via call_rcu(). */ +static void kfree_call_rcu(struct rcu_head *rh) +{ + struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh); + + kfree(obj); +} + static int kfree_scale_thread(void *arg) { @@ -696,6 +705,11 @@ kfree_scale_thread(void *arg) if (!alloc_ptr) return -ENOMEM; + if (kfree_by_call_rcu) { + call_rcu(&(alloc_ptr->rh), kfree_call_rcu); + continue; + } + // By default kfree_rcu_test_single and kfree_rcu_test_double are // initialized to false. If both have the same value (false or true) // both are randomly tested, otherwise only the one with value true @@ -767,11 +781,58 @@ kfree_scale_shutdown(void *arg) return -EINVAL; } +// Used if doing RCU-kfree'ing via call_rcu(). +static unsigned long jiffies_at_lazy_cb; +static struct rcu_head lazy_test1_rh; +static int rcu_lazy_test1_cb_called; +static void call_rcu_lazy_test1(struct rcu_head *rh) +{ + jiffies_at_lazy_cb = jiffies; + WRITE_ONCE(rcu_lazy_test1_cb_called, 1); +} + static int __init kfree_scale_init(void) { - long i; int firsterr = 0; + long i; + unsigned long jif_start; + unsigned long orig_jif; + + // Also, do a quick self-test to ensure laziness is as much as + // expected. + if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { + pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n"); + kfree_by_call_rcu = 0; + } + + if (kfree_by_call_rcu) { + /* do a test to check the timeout. 
*/ + orig_jif = rcu_lazy_get_jiffies_till_flush(); + + rcu_lazy_set_jiffies_till_flush(2 * HZ); + rcu_barrier(); + + jif_start = jiffies; + jiffies_at_lazy_cb = 0; + call_rcu(&lazy_test1_rh, call_rcu_lazy_test1); + + smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); + + rcu_lazy_set_jiffies_till_flush(orig_jif); + + if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { + pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); + WARN_ON_ONCE(1); + return -1; + } + + if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { + pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); + WARN_ON_ONCE(1); + return -1; + } + } kfree_nrealthreads = compute_real(kfree_nthreads); /* Start up the kthreads. */ @@ -784,7 +845,9 @@ kfree_scale_init(void) schedule_timeout_uninterruptible(1); } - pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); + pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n", + kfree_mult * sizeof(struct kfree_obj), + kfree_by_call_rcu); kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d8e1b270a065..634df26a2c27 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -84,10 +84,15 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); +torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); +torture_param(bool, gp_cond_exp_full, false, + "Use conditional/async full-stateexpedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); +torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); +torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -194,16 +199,24 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_GET_EXP 6 -#define RTWS_COND_SYNC 7 -#define RTWS_COND_SYNC_EXP 8 -#define RTWS_POLL_GET 9 -#define RTWS_POLL_GET_EXP 10 -#define RTWS_POLL_WAIT 11 -#define RTWS_POLL_WAIT_EXP 12 -#define RTWS_SYNC 13 -#define RTWS_STUTTER 14 -#define RTWS_STOPPING 15 +#define RTWS_COND_GET_FULL 6 +#define RTWS_COND_GET_EXP 7 +#define RTWS_COND_GET_EXP_FULL 8 +#define RTWS_COND_SYNC 9 +#define RTWS_COND_SYNC_FULL 10 +#define RTWS_COND_SYNC_EXP 11 +#define RTWS_COND_SYNC_EXP_FULL 12 +#define RTWS_POLL_GET 13 +#define RTWS_POLL_GET_FULL 14 +#define RTWS_POLL_GET_EXP 15 +#define RTWS_POLL_GET_EXP_FULL 16 +#define RTWS_POLL_WAIT 17 +#define RTWS_POLL_WAIT_FULL 18 +#define RTWS_POLL_WAIT_EXP 19 +#define RTWS_POLL_WAIT_EXP_FULL 20 +#define RTWS_SYNC 21 +#define RTWS_STUTTER 22 +#define RTWS_STOPPING 23 static const char 
* const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -211,13 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_FULL", "RTWS_COND_GET_EXP", + "RTWS_COND_GET_EXP_FULL", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_FULL", "RTWS_COND_SYNC_EXP", + "RTWS_COND_SYNC_EXP_FULL", "RTWS_POLL_GET", + "RTWS_POLL_GET_FULL", "RTWS_POLL_GET_EXP", + "RTWS_POLL_GET_EXP_FULL", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_FULL", "RTWS_POLL_WAIT_EXP", + "RTWS_POLL_WAIT_EXP_FULL", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -332,13 +353,25 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); + void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state_exp)(unsigned long oldstate); void (*cond_sync_exp)(unsigned long oldstate); + void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp); + unsigned long (*get_comp_state)(void); + void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2); + bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); + void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*get_gp_completed)(void); + void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); + void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); + bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); + void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -481,7 +514,7 @@ static unsigned long rcu_no_completed(void) static void rcu_torture_deferred_free(struct rcu_torture *p) { - call_rcu(&p->rtort_rcu, rcu_torture_cb); + call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb); } static void rcu_sync_torture_init(void) @@ -489,6 +522,11 @@ static void rcu_sync_torture_init(void) INIT_LIST_HEAD(&rcu_torture_removed); } +static bool rcu_poll_need_2gp(bool poll, bool poll_full) +{ + return poll; +} + static struct rcu_torture_ops rcu_ops = { .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, @@ -501,16 +539,27 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, + .same_gp_state = same_state_synchronize_rcu, + .same_gp_state_full = same_state_synchronize_rcu_full, + .get_comp_state = get_completed_synchronize_rcu, + .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, + .get_gp_state_full = get_state_synchronize_rcu_full, .get_gp_completed = get_completed_synchronize_rcu, + .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, + .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, + .poll_gp_state_full = poll_state_synchronize_rcu_full, + .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, + .cond_sync_full = cond_synchronize_rcu_full, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = 
poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, - .call = call_rcu, + .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -574,10 +623,14 @@ static struct rcu_torture_ops rcu_busted_ops = { DEFINE_STATIC_SRCU(srcu_ctl); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; +static struct rcu_torture_ops srcud_ops; static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) { - return srcu_read_lock(srcu_ctlp); + if (cur_ops == &srcud_ops) + return srcu_read_lock_nmisafe(srcu_ctlp); + else + return srcu_read_lock(srcu_ctlp); } static void @@ -601,7 +654,10 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) { - srcu_read_unlock(srcu_ctlp, idx); + if (cur_ops == &srcud_ops) + srcu_read_unlock_nmisafe(srcu_ctlp, idx); + else + srcu_read_unlock(srcu_ctlp, idx); } static int torture_srcu_read_lock_held(void) @@ -709,6 +765,9 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -804,7 +863,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) static void synchronize_rcu_mult_test(void) { - synchronize_rcu_mult(call_rcu_tasks, call_rcu); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry); } static struct rcu_torture_ops tasks_ops = { @@ -1148,15 +1207,35 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; - bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; - bool gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full; + bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp; + bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. 
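Editorial sketch, not part of the patch: the new gp_*_full torture parameters exercise the full-state polled grace-period API. Its two basic caller-side shapes are roughly:

	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	static void cond_wait_example(void)
	{
		struct rcu_gp_oldstate rgos;

		get_state_synchronize_rcu_full(&rgos);	/* snapshot current GP state */
		/* ... arbitrary other work ... */
		cond_synchronize_rcu_full(&rgos);	/* block only if no GP elapsed since */
	}

	static void poll_example(void)
	{
		struct rcu_gp_oldstate rgos;

		start_poll_synchronize_rcu_full(&rgos);	/* snapshot and kick off a GP */
		while (!poll_state_synchronize_rcu_full(&rgos))
			schedule_timeout_uninterruptible(1);
	}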
*/ - if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && - !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = - gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && + !gp_cond_exp1 && + !gp_cond_full1 && + !gp_cond_exp_full1 && + !gp_exp1 && + !gp_poll_exp1 && + !gp_poll_exp_full1 && + !gp_normal1 && + !gp_poll1 && + !gp_poll_full1 && + !gp_sync1) { + gp_cond1 = true; + gp_cond_exp1 = true; + gp_cond_full1 = true; + gp_cond_exp_full1 = true; + gp_exp1 = true; + gp_poll_exp1 = true; + gp_poll_exp_full1 = true; + gp_normal1 = true; + gp_poll1 = true; + gp_poll_full1 = true; + gp_sync1 = true; + } if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); @@ -1169,6 +1248,19 @@ static void rcu_torture_write_types(void) } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { pr_alert("%s: gp_cond_exp without primitives.\n", __func__); } + if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) { + synctype[nsynctypes++] = RTWS_COND_GET_FULL; + pr_info("%s: Testing conditional full-state GPs.\n", __func__); + } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) { + pr_alert("%s: gp_cond_full without primitives.\n", __func__); + } + if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL; + pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__); + } else if (gp_cond_exp_full && + (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) { + pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1181,18 +1273,33 @@ static void rcu_torture_write_types(void) } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); } - if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) { + if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state && + cur_ops->start_gp_poll && cur_ops->poll_gp_state) { synctype[nsynctypes++] = RTWS_POLL_GET; pr_info("%s: Testing polling GPs.\n", __func__); } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full + && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_FULL; + pr_info("%s: Testing polling full-state GPs.\n", __func__); + } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_full without primitives.\n", __func__); + } if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { synctype[nsynctypes++] = RTWS_POLL_GET_EXP; pr_info("%s: Testing polling expedited GPs.\n", __func__); } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { pr_alert("%s: gp_poll_exp without primitives.\n", __func__); } + if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL; + pr_info("%s: Testing polling full-state expedited GPs.\n", __func__); + } else if (gp_poll_exp_full && + (!cur_ops->start_gp_poll_exp_full || 
!cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1202,6 +1309,40 @@ static void rcu_torture_write_types(void) } /* + * Do the specified rcu_torture_writer() synchronous grace period, + * while also testing out the polled APIs. Note well that the single-CPU + * grace-period optimizations must be accounted for. + */ +static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) +{ + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; + bool dopoll; + bool dopoll_full; + unsigned long r = torture_random(trsp); + + dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300); + dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00); + if (dopoll || dopoll_full) + cpus_read_lock(); + if (dopoll) + cookie = cur_ops->get_gp_state(); + if (dopoll_full) + cur_ops->get_gp_state_full(&cookie_full); + if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full)) + sync(); + sync(); + WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %pS() online %*pbl.", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 4 failed %pS() online %*pbl", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + if (dopoll || dopoll_full) + cpus_read_unlock(); +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -1212,15 +1353,21 @@ rcu_torture_writer(void *arg) bool boot_ended; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int expediting = 0; unsigned long gp_snap; + unsigned long gp_snap1; + struct rcu_gp_oldstate gp_snap_full; + struct rcu_gp_oldstate gp_snap1_full; int i; int idx; int oldnice = task_nice(current); + struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); bool stutter_waited; + unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) @@ -1261,11 +1408,12 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + + // Make sure readers block polled grace periods. 
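Editorial aside, not part of the patch: the cookie checks that follow assert, stripped of the torture-test indirection, that a grace period snapshotted inside a read-side critical section cannot have completed while that reader is still running:

	#include <linux/rcupdate.h>

	static void readers_block_polled_gps(void)
	{
		unsigned long cookie;

		rcu_read_lock();
		cookie = get_state_synchronize_rcu();
		/* This reader predates the snapshotted GP, so it cannot have ended. */
		WARN_ON_ONCE(poll_state_synchronize_rcu(cookie));
		rcu_read_unlock();
	}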
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { idx = cur_ops->readlock(); cookie = cur_ops->get_gp_state(); - WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && - cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(cookie), "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), @@ -1277,6 +1425,21 @@ rcu_torture_writer(void *arg) } cur_ops->readunlock(idx); } + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) { + idx = cur_ops->readlock(); + cur_ops->get_gp_state_full(&cookie_full); + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 5 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + if (cur_ops->get_gp_completed_full) { + cur_ops->get_gp_completed_full(&cookie_full); + WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); + } + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1284,12 +1447,7 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); - cur_ops->exp_sync(); - cur_ops->exp_sync(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + do_rtws_sync(&rand, cur_ops->exp_sync); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1308,13 +1466,61 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_FULL: + rcu_torture_writer_state = RTWS_COND_GET_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_FULL; + cur_ops->cond_sync_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; + cur_ops->cond_sync_exp_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; + for (i = 0; i < ARRAY_SIZE(ulo); i++) + ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; - while (!cur_ops->poll_gp_state(gp_snap)) + while (!cur_ops->poll_gp_state(gp_snap)) { + gp_snap1 = cur_ops->get_gp_state(); + for (i = 0; i < ARRAY_SIZE(ulo); i++) + if (cur_ops->poll_gp_state(ulo[i]) || + cur_ops->same_gp_state(ulo[i], gp_snap1)) { + ulo[i] = gp_snap1; + break; + } + WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + } + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_FULL; + for (i = 0; i < ARRAY_SIZE(rgo); i++) + cur_ops->get_comp_state_full(&rgo[i]); + cur_ops->start_gp_poll_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + cur_ops->get_gp_state_full(&gp_snap1_full); + for (i = 0; i < ARRAY_SIZE(rgo); i++) + if (cur_ops->poll_gp_state_full(&rgo[i]) || + cur_ops->same_gp_state_full(&rgo[i], + &gp_snap1_full)) { + rgo[i] = gp_snap1_full; + break; + } + WARN_ON_ONCE(i >= 
ARRAY_SIZE(rgo)); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP: @@ -1326,14 +1532,18 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL; + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); - cur_ops->sync(); - cur_ops->sync(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + do_rtws_sync(&rand, cur_ops->sync); rcu_torture_pipe_update(old_rp); break; default: @@ -1400,6 +1610,7 @@ static int rcu_torture_fakewriter(void *arg) { unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1438,6 +1649,16 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync_exp(gp_snap); break; + case RTWS_COND_GET_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_full(&gp_snap_full); + break; + case RTWS_COND_GET_EXP_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp_full(&gp_snap_full); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1445,6 +1666,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_FULL: + cur_ops->start_gp_poll_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_POLL_GET_EXP: gp_snap = cur_ops->start_gp_poll_exp(); while (!cur_ops->poll_gp_state_exp(gp_snap)) { @@ -1452,6 +1680,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_EXP_FULL: + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; @@ -1715,7 +1950,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, */ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { + bool checkpolling = !(torture_random(trsp) & 0xfff); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int i; unsigned long started; unsigned long completed; @@ -1731,8 +1968,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + cur_ops->get_gp_state_full(&cookie_full); + } started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = 
rcu_dereference_check(rcu_torture_current, @@ -1766,13 +2007,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), - "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", - __func__, - rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 6 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + } rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -2600,12 +2850,12 @@ static int rcutorture_oom_notify(struct notifier_block *self, for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); @@ -3182,13 +3432,13 @@ static void rcu_test_debug_objects(void) /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + call_rcu_hurry(&rh2, rcu_torture_leak_cb); + call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu(rhp, rcu_torture_leak_cb); - call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + call_rcu_hurry(rhp, rcu_torture_leak_cb); + call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ } local_irq_enable(); rcu_read_unlock(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 92c002d65482..b12fb0cec44d 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. 
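Editorial sketch, not part of the patch: the Tiny SRCU switch from 16-bit to unsigned long cookies above concerns the polled SRCU grace-period interface, whose caller-side shape is as follows (names hypothetical):

	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(my_srcu);

	static unsigned long my_cookie;

	static void producer(void)
	{
		/* Snapshot the GP state, starting a grace period if one is needed. */
		my_cookie = start_poll_synchronize_srcu(&my_srcu);
	}

	static bool consumer(void)
	{
		/* True once a full SRCU grace period has elapsed since the snapshot. */
		return poll_state_synchronize_srcu(&my_srcu, my_cookie);
	}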
*/ WRITE_ONCE(ssp->srcu_gp_running, false); - if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { - unsigned short cookie; + unsigned long cookie; cookie = get_state_synchronize_srcu(ssp); - if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) return; WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { @@ -197,6 +197,16 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + + might_sleep(); init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); call_srcu(ssp, &rs.head, wakeme_after_rcu); @@ -215,7 +225,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) barrier(); ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; barrier(); - return ret & USHRT_MAX; + return ret; } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -240,10 +250,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ret; + return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1c304fec89c0..ca4b5dcec675 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -417,7 +417,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) for_each_possible_cpu(cpu) { struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); - sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); } return sum; } @@ -429,13 +429,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); - sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); } + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), + "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); return sum; } @@ -503,10 +508,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); - sum += READ_ONCE(cpuc->srcu_lock_count[0]); - sum += READ_ONCE(cpuc->srcu_lock_count[1]); - sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); - sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&cpuc->srcu_lock_count[0]); + sum += atomic_long_read(&cpuc->srcu_lock_count[1]); + sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); + sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); } return sum; } @@ -626,6 +631,29 @@ void cleanup_srcu_struct(struct srcu_struct 
*ssp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +#ifdef CONFIG_PROVE_RCU +/* + * Check for consistent NMI safety. + */ +void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +{ + int nmi_safe_mask = 1 << nmi_safe; + int old_nmi_safe_mask; + struct srcu_data *sdp; + + /* NMI-unsafe use in NMI is a bad sign */ + WARN_ON_ONCE(!nmi_safe && in_nmi()); + sdp = raw_cpu_ptr(ssp->sda); + old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); + if (!old_nmi_safe_mask) { + WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); + return; + } + WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); +} +EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); +#endif /* CONFIG_PROVE_RCU */ + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. @@ -636,7 +664,7 @@ int __srcu_read_lock(struct srcu_struct *ssp) int idx; idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx]); + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -650,10 +678,45 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx]); + this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); +#ifdef CONFIG_NEED_SRCU_NMI_SAFE + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct, but in an NMI-safe manner using RMW atomics. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) +{ + int idx; + struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + atomic_long_inc(&sdp->srcu_lock_count[idx]); + smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + */ +void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) +{ + struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + + smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ + atomic_long_inc(&sdp->srcu_unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); + +#endif // CONFIG_NEED_SRCU_NMI_SAFE + /* * Start an SRCU grace period. */ @@ -1090,7 +1153,12 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, int ss_state; check_init_srcu_struct(ssp); - idx = srcu_read_lock(ssp); + /* + * While starting a new grace period, make sure we are in an + * SRCU read-side critical section so that the grace-period + * sequence number cannot wrap around in the meantime. 
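Editorial sketch, not part of the patch: the __srcu_read_lock_nmisafe() primitives added above back the srcu_read_lock_nmisafe() interface. A caller with hypothetical names looks like this; note that, as the "Mixed NMI-safe readers" check above enforces, all readers of a given srcu_struct must consistently use either the _nmisafe flavor or the plain one, never both.

	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(my_nmi_srcu);

	/* Called from NMI context, e.g. a perf or watchdog handler. */
	static void my_nmi_reader(void)
	{
		int idx;

		idx = srcu_read_lock_nmisafe(&my_nmi_srcu);
		/* ... read-side critical section ... */
		srcu_read_unlock_nmisafe(&my_nmi_srcu, idx);
	}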
+ */ + idx = __srcu_read_lock_nmisafe(ssp); ss_state = smp_load_acquire(&ssp->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, 0); @@ -1123,7 +1191,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) srcu_funnel_exp_start(ssp, sdp_mynode, s); - srcu_read_unlock(ssp, idx); + __srcu_read_unlock_nmisafe(ssp, idx); return s; } @@ -1427,13 +1495,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - idx = srcu_read_lock(ssp); + idx = __srcu_read_lock_nmisafe(ssp); if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); else for_each_possible_cpu(cpu) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); - srcu_read_unlock(ssp, idx); + __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) @@ -1687,8 +1755,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); + u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); + u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); /* * Make sure that a lock is always counted if the corresponding @@ -1696,8 +1764,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); + l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); + l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 5cefc702158f..e550f97779b8 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -44,7 +44,7 @@ static void rcu_sync_func(struct rcu_head *rhp); static void rcu_sync_call(struct rcu_sync *rsp) { - call_rcu(&rsp->cb_head, rcu_sync_func); + call_rcu_hurry(&rsp->cb_head, rcu_sync_func); } /** diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 83c7e6620d40..fe9840d90e96 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); // If the grace-period kthread is running, use it. 
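Editorial aside, not part of the patch: for readers unfamiliar with the Tasks RCU flavor touched here, its canonical use is waiting for every task to pass through a voluntary context switch before freeing instruction text such as a tracing trampoline. A hedged sketch with hypothetical helpers:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void release_trampoline(void *tramp)
	{
		/* unhook_all_callsites(tramp);  (hypothetical) */

		/* Wait until no task can still be executing inside tramp. */
		synchronize_rcu_tasks();

		kfree(tramp);	/* assuming tramp came from a kmalloc()-style pool */
	}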
@@ -728,7 +728,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { lastinfo = j; rtsi = rtsi * rcu_task_stall_info_mult; - pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n", __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); } } @@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); rcu_read_unlock(); + cond_resched_tasks_rcu_qs(); } // Only after all running tasks have been accounted for is it @@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); } raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list is populated. @@ -1533,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) { // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). + + // If you remove the following line, update rcu_trace_implies_rcu_gp()!!! synchronize_rcu(); // Any tasks that exit after this point will set // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. @@ -1619,6 +1623,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list scan has completed. diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f0561ee16b9c..72913ce21258 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = { void rcu_barrier(void) { - wait_rcu_gp(call_rcu); + wait_rcu_gp(call_rcu_hurry); } EXPORT_SYMBOL(rcu_barrier); @@ -158,6 +158,10 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +static void tiny_rcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } + + if (!__is_kvfree_rcu_offset((unsigned long)head->func)) + WRITE_ONCE(head->func, tiny_rcu_leak_callback); + return; + } + head->func = func; head->next = NULL; @@ -184,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) EXPORT_SYMBOL_GPL(call_rcu); /* + * Store a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/* * Return a grace-period-counter "cookie". For more information, * see the Tree RCU header comment. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79aea7df4345..cf34a961821a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -76,6 +76,7 @@ /* Data structures. 
*/ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { + .gpwrap = true, #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, #endif @@ -300,12 +301,6 @@ static bool rcu_dynticks_in_eqs(int snap) return !(snap & RCU_DYNTICKS_IDX); } -/* Return true if the specified CPU is currently idle from an RCU viewpoint. */ -bool rcu_is_idle_cpu(int cpu) -{ - return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); -} - /* * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since @@ -1367,7 +1362,7 @@ static void rcu_poll_gp_seq_start(unsigned long *snap) { struct rcu_node *rnp = rcu_get_root(); - if (rcu_init_invoked()) + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) raw_lockdep_assert_held_rcu_node(rnp); // If RCU was idle, note beginning of GP. @@ -1383,7 +1378,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) { struct rcu_node *rnp = rcu_get_root(); - if (rcu_init_invoked()) + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) raw_lockdep_assert_held_rcu_node(rnp); // If the previously noted GP is still in effect, record the @@ -1402,30 +1397,34 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) // where caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) { + unsigned long flags; struct rcu_node *rnp = rcu_get_root(); if (rcu_init_invoked()) { - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq_rcu_node(rnp); + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + lockdep_assert_irqs_enabled(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); } rcu_poll_gp_seq_start(snap); if (rcu_init_invoked()) - raw_spin_unlock_irq_rcu_node(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } // Make the polled API aware of the end of a grace period, but where // caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) { + unsigned long flags; struct rcu_node *rnp = rcu_get_root(); if (rcu_init_invoked()) { - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq_rcu_node(rnp); + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + lockdep_assert_irqs_enabled(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); } rcu_poll_gp_seq_end(snap); if (rcu_init_invoked()) - raw_spin_unlock_irq_rcu_node(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -1755,6 +1754,8 @@ static noinline void rcu_gp_cleanup(void) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); + if (!rnp->parent) + smp_mb(); // Order against failing poll_state_synchronize_rcu_full(). rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) needgp = __note_gp_changes(rnp, rdp) || needgp; @@ -2103,7 +2104,7 @@ int rcutree_dying_cpu(unsigned int cpu) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - blkd = !!(rnp->qsmask & rdp->grpmask); + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), blkd ? 
TPS("cpuofl-bgp") : TPS("cpuofl")); return 0; @@ -2341,8 +2342,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); - if (user) - rcu_tasks_classic_qs(current, false); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_note_voluntary_context_switch(current); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -2413,7 +2414,7 @@ void rcu_force_quiescent_state(void) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = __this_cpu_read(rcu_data.mynode); + rnp = raw_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); @@ -2725,47 +2726,8 @@ static void check_cb_ovld(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); } -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. - * - * RCU read-side critical sections are delimited by rcu_read_lock() - * and rcu_read_unlock(), and may be nested. In addition, but only in - * v5.0 and later, regions of code across which interrupts, preemption, - * or softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - * - * Implementation of these memory-ordering guarantees is described here: - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. 
- */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) +static void +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) { static atomic_t doublefrees; unsigned long flags; @@ -2806,7 +2768,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); @@ -2828,11 +2790,87 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(call_rcu); +#ifdef CONFIG_RCU_LAZY +/** + * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and + * flush all lazy callbacks (including the new one) to the main ->cblist while + * doing so. + * + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. + * + * Use this API instead of call_rcu() if you don't want the callback to be + * invoked after very long periods of time, which can happen on systems without + * memory pressure and on systems which are lightly loaded or mostly idle. + * This function will cause callbacks to be invoked sooner than later at the + * expense of extra power. Other than that, this function is identical to, and + * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory + * ordering and other functionality. + */ +void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) +{ + return __call_rcu_common(head, func, false); +} +EXPORT_SYMBOL_GPL(call_rcu_hurry); +#endif + +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * By default the callbacks are 'lazy' and are kept hidden from the main + * ->cblist to prevent starting of grace periods too soon. + * If you desire grace periods to start very soon, use call_rcu_hurry(). + * + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). 
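To make the new lazy/hurried split concrete, here is a minimal usage sketch. The struct foo type, foo_free_rcu(), and the two release helpers are invented for illustration; only call_rcu() and call_rcu_hurry() themselves come from this patch, and the sketch assumes the usual header wrappers keep call_rcu_hurry() available even when CONFIG_RCU_LAZY is not set.

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical object embedding an rcu_head for deferred freeing. */
struct foo {
	int key;
	struct rcu_head rh;
};

/* Invoked after a grace period; recovers the enclosing object and frees it. */
static void foo_free_rcu(struct rcu_head *rhp)
{
	struct foo *fp = container_of(rhp, struct foo, rh);

	kfree(fp);
}

/* Default path: lazy call_rcu() may batch this callback on an idle system. */
static void foo_release(struct foo *fp)
{
	call_rcu(&fp->rh, foo_free_rcu);
}

/* Memory- or latency-sensitive path: flush lazy callbacks and start a GP soon. */
static void foo_release_now(struct foo *fp)
{
	call_rcu_hurry(&fp->rh, foo_free_rcu);
}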
It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); +} +EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 #define FREE_N_CHANNELS 2 @@ -3093,6 +3131,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) return !!krcp->head; } +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(system_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(system_wq, &krcp->monitor_work, delay); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3150,7 +3203,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // work to repeat an attempt. Because previous batches are // still in progress. if (need_offload_krc(krcp)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } @@ -3183,15 +3236,16 @@ static void fill_page_cache_func(struct work_struct *work) bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (bnode) { - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); + if (!bnode) + break; - if (!pushed) { - free_page((unsigned long) bnode); - break; - } + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; } } @@ -3338,7 +3392,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3371,7 +3425,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) atomic_set(&krcp->backoff_page_cache_fill, 1); } - return count; + return count == 0 ? 
SHRINK_EMPTY : count; } static unsigned long @@ -3414,49 +3468,27 @@ void __init kfree_rcu_scheduler_running(void) raw_spin_lock_irqsave(&krcp->lock, flags); if (need_offload_krc(krcp)) - schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPTION. + * implies a grace period. * - * However, because a context switch is a grace period for !PREEMPTION, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. + * Later on, this could in theory be the case for kernels built with + * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this + * is not a common case. Furthermore, this optimization would cause + * the rcu_gp_oldstate structure to expand by 50%, so this potential + * grace-period optimization is ignored once the scheduler is running. */ static int rcu_blocking_is_gp(void) { - int ret; - - // Invoking preempt_model_*() too early gets a splat. - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || - preempt_model_full() || preempt_model_rt()) - return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + return false; might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - /* - * If the rcu_state.n_online_cpus counter is equal to one, - * there is only one CPU, and that CPU sees all prior accesses - * made by any CPU that was online at the time of its access. - * Furthermore, if this counter is equal to one, its value cannot - * change until after the preempt_enable() below. - * - * Furthermore, if rcu_state.n_online_cpus is equal to one here, - * all later CPUs (both this one and any that come online later - * on) are guaranteed to see all accesses prior to this point - * in the code, without the need for additional memory barriers. - * Those memory barriers are provided by CPU-hotplug code. - */ - ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; - preempt_enable(); - return ret; + return true; } /** @@ -3499,30 +3531,59 @@ static int rcu_blocking_is_gp(void) */ void synchronize_rcu(void) { + unsigned long flags; + struct rcu_node *rnp; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) { - // Note well that this code runs with !PREEMPT && !SMP. - // In addition, all code that advances grace periods runs at - // process level. Therefore, this normal GP overlaps with - // other normal GPs only by being fully nested within them, - // which allows reuse of ->gp_seq_polled_snap. - rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); - rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); - if (rcu_init_invoked()) - cond_resched_tasks_rcu_qs(); - return; // Context allows vacuous grace periods. 
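The schedule_delayed_monitor_work() helper added above re-arms an already-pending monitor timer only when the new deadline is sooner than the one in flight. Outside of the kvfree_rcu() machinery, that pattern looks roughly like the sketch below; my_ctx, MY_DRAIN_DELAY, MY_URGENT_LIMIT, and my_schedule_drain() are hypothetical names, not part of this patch.

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* Hypothetical context owning one piece of delayed drain work. */
struct my_ctx {
	struct delayed_work work;
	atomic_t count;
};

#define MY_DRAIN_DELAY	(5 * HZ)	/* relaxed default, like KFREE_DRAIN_JIFFIES */
#define MY_URGENT_LIMIT	64		/* backlog at which draining becomes urgent */

/* Queue the drain, or pull an already-queued drain closer if backlogged. */
static void my_schedule_drain(struct my_ctx *ctx)
{
	long delay = atomic_read(&ctx->count) >= MY_URGENT_LIMIT ? 1 : MY_DRAIN_DELAY;

	if (delayed_work_pending(&ctx->work)) {
		long delay_left = ctx->work.timer.expires - jiffies;

		/* Only re-arm if that actually makes the work run sooner. */
		if (delay < delay_left)
			mod_delayed_work(system_wq, &ctx->work, delay);
		return;
	}
	queue_delayed_work(system_wq, &ctx->work, delay);
}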
+ if (!rcu_blocking_is_gp()) { + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu_hurry); + return; } - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); + + // Context allows vacuous grace periods. + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with other + // normal GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + + // Update the normal grace-period counters to record + // this grace period, but only those used by the boot CPU. + // The rcu_scheduler_starting() will take care of the rest of + // these counters. + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT); + for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(synchronize_rcu); /** + * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie + * @rgosp: Place to put state cookie + * + * Stores into @rgosp a value that will always be treated by functions + * like poll_state_synchronize_rcu_full() as a cookie whose grace period + * has already completed. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; + rgosp->rgos_exp = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/** * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() @@ -3541,21 +3602,42 @@ unsigned long get_state_synchronize_rcu(void) EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); /** - * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited + * @rgosp: location to place combined normal/expedited grace-period state * - * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * or poll_state_synchronize_rcu() to determine whether or not a full - * grace period has elapsed in the meantime. If the needed grace period - * is not already slated to start, notifies RCU core of the need for that - * grace period. + * Places the normal and expedited grace-period states in @rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * The rcu_gp_oldstate structure takes up twice the memory of an unsigned + * long, but is guaranteed to see all grace periods. In contrast, the + * combined state occupies less memory, but can sometimes fail to take + * grace periods into account. * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. + * This does not guarantee that the needed grace period will actually + * start. 
*/ -unsigned long start_poll_synchronize_rcu(void) +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + /* + * Any prior manipulation of RCU-protected data must happen + * before the loads from ->gp_seq and ->expedited_sequence. + */ + smp_mb(); /* ^^^ */ + rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); + +/* + * Helper function for start_poll_synchronize_rcu() and + * start_poll_synchronize_rcu_full(). + */ +static void start_poll_synchronize_rcu_common(void) { unsigned long flags; - unsigned long gp_seq = get_state_synchronize_rcu(); bool needwake; struct rcu_data *rdp; struct rcu_node *rnp; @@ -3575,17 +3657,57 @@ unsigned long start_poll_synchronize_rcu(void) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); +} + +/** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + start_poll_synchronize_rcu_common(); return gp_seq; } EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); /** - * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period + * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * + * Places the normal and expedited grace-period states in *@rgos. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed grace period is not already slated to start, notifies + * RCU core of the need for that grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + + start_poll_synchronize_rcu_common(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); + +/** + * poll_state_synchronize_rcu - Has the specified RCU grace period completed? * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from - * which oldstate was obtained, return @true, otherwise return @false. + * which @oldstate was obtained, return @true, otherwise return @false. * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate @@ -3594,10 +3716,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for * more than a billion grace periods (and way more on a 64-bit system!). 
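One plausible way to consume the new full-state cookie API is to embed a struct rcu_gp_oldstate next to the data it protects. The sketch below is illustrative only (my_cache and its helpers are invented): it seeds the cookie as already completed via get_completed_synchronize_rcu_full() and refreshes it with start_poll_synchronize_rcu_full() on each retirement; the matching consumer side is sketched after the poll/cond helpers further on.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

/* Hypothetical cache whose stale entry may be reused only after a grace period. */
struct my_cache {
	spinlock_t lock;
	struct rcu_gp_oldstate gp_cookie;	/* combined normal+expedited cookie */
	void *stale;
};

static void my_cache_init(struct my_cache *c)
{
	spin_lock_init(&c->lock);
	c->stale = NULL;
	/* Start out "already completed" so the first poll succeeds immediately. */
	get_completed_synchronize_rcu_full(&c->gp_cookie);
}

/* Retire an entry: record it and kick off (or note) a grace period. */
static void my_cache_retire(struct my_cache *c, void *obj)
{
	spin_lock(&c->lock);
	c->stale = obj;
	/*
	 * Snapshot and start a grace period.  The cheaper
	 * get_state_synchronize_rcu_full() would do if the caller
	 * never needs to force a grace period to start.
	 */
	start_poll_synchronize_rcu_full(&c->gp_cookie);
	spin_unlock(&c->lock);
}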
- * Those needing to keep oldstate values for very long time periods - * (many hours even on 32-bit systems) should check them occasionally - * and either refresh them or set a flag indicating that the grace period - * has completed. + * Those needing to keep old state values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally and + * either refresh them or set a flag indicating that the grace period has + * completed. Alternatively, they can use get_completed_synchronize_rcu() + * to get a guaranteed-completed grace-period state. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call @@ -3616,8 +3739,56 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** - * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() + * + * If a full RCU grace period has elapsed since the earlier call from + * which *rgosp was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibility to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @rgosp + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited + * for more than a billion grace periods (and way more on a 64-bit + * system!). Those needing to keep rcu_gp_oldstate values for very + * long time periods (many hours even on 32-bit systems) should check + * them occasionally and either refresh them or set a flag indicating + * that the grace period has completed. Alternatively, they can use + * get_completed_synchronize_rcu_full() to get a guaranteed-completed + * grace-period state. * + * This function provides the same memory-ordering guarantees that would + * be provided by a synchronize_rcu() that was invoked at the call to + * the function that provided @rgosp, and that returned at the end of this + * function. And this guarantee requires that the root rcu_node structure's + * ->gp_seq field be checked instead of that of the rcu_state structure. + * The problem is that the just-ending grace-period's callbacks can be + * invoked between the time that the root rcu_node structure's ->gp_seq + * field is updated and the time that the rcu_state structure's ->gp_seq + * field is updated. Therefore, if a single synchronize_rcu() is to + * cause a subsequent poll_state_synchronize_rcu_full() to return @true, + * then the root rcu_node structure is the one that needs to be polled. + */ +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + smp_mb(); // Order against root rcu_node structure grace-period cleanup. + if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) || + rgosp->rgos_exp == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. 
*/ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to @@ -3641,6 +3812,33 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu() to wait + * for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full); + /* * Check to see if there is any immediate RCU-related work to be done by * the current CPU, returning 1 if so and zero otherwise. The checks are @@ -3731,6 +3929,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) { unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); + bool wake_nocb = false; + bool was_alldone = false; lockdep_assert_held(&rcu_state.barrier_lock); if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) @@ -3739,7 +3939,14 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + /* + * Flush bypass and wakeup rcuog if we add callbacks to an empty regular + * queue. This way we don't wait for bypass timer that can reach seconds + * if it's fully lazy. + */ + was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { @@ -3747,6 +3954,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); + if (wake_nocb) + wake_nocb_gp(rdp, false); smp_store_release(&rdp->barrier_seq_snap, gseq); } @@ -4113,8 +4322,6 @@ void rcu_report_dead(unsigned int cpu) // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); - /* QS for any half-done expedited grace period. 
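Continuing the hypothetical my_cache sketch from above, the consumer side checks the stored cookie with poll_state_synchronize_rcu_full() on the fast path and falls back to cond_synchronize_rcu_full() when it must not proceed before the recorded grace period ends.

/* Fast path: reuse the stale entry only if its grace period already ended. */
static void *my_cache_try_reuse(struct my_cache *c)
{
	void *obj = NULL;

	spin_lock(&c->lock);
	if (c->stale && poll_state_synchronize_rcu_full(&c->gp_cookie)) {
		obj = c->stale;
		c->stale = NULL;
	}
	spin_unlock(&c->lock);
	return obj;	/* NULL means "try again later". */
}

/* Slow path: sleep until the recorded grace period has definitely elapsed. */
static void *my_cache_reuse_sync(struct my_cache *c)
{
	void *obj;

	/* May call synchronize_rcu(), so this must be a sleepable context. */
	cond_synchronize_rcu_full(&c->gp_cookie);

	spin_lock(&c->lock);
	obj = c->stale;
	c->stale = NULL;
	spin_unlock(&c->lock);
	return obj;
}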
*/ - rcu_report_exp_rdp(rdp); rcu_preempt_deferred_qs(current); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ @@ -4162,7 +4369,7 @@ void rcutree_migrate_callbacks(int cpu) my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; rcu_nocb_lock(my_rdp); /* irqs already disabled. */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false)); raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ needwake = rcu_advance_cbs(my_rnp, rdp) || @@ -4312,9 +4519,20 @@ early_initcall(rcu_spawn_gp_kthread); */ void rcu_scheduler_starting(void) { + unsigned long flags; + struct rcu_node *rnp; + WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); rcu_test_sync_prims(); + + // Fix up the ->gp_seq counters. + local_irq_save(flags); + rcu_for_each_node_breadth_first(rnp) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); + + // Switch out of early boot mode. rcu_scheduler_active = RCU_SCHEDULER_INIT; rcu_test_sync_prims(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d4a97e40ea9c..fcb5d696eb17 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -263,14 +263,16 @@ struct rcu_data { unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ + long lazy_len; /* Length of buffered lazy callbacks. */ int cpu; }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOCB_WAKE_NOT 0 #define RCU_NOCB_WAKE_BYPASS 1 -#define RCU_NOCB_WAKE 2 -#define RCU_NOCB_WAKE_FORCE 3 +#define RCU_NOCB_WAKE_LAZY 2 +#define RCU_NOCB_WAKE 3 +#define RCU_NOCB_WAKE_FORCE 4 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -439,10 +441,12 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); +static bool wake_nocb_gp(struct rcu_data *rdp, bool force); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j); + unsigned long j, bool lazy); static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags); + bool *was_alldone, unsigned long flags, + bool lazy); static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index be667583a554..ed6c3cce28f2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; - if (rcu_is_cpu_rrupt_from_idle()) { + if (rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } @@ -906,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) void synchronize_rcu_expedited(void) { bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + unsigned long 
flags; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -924,14 +927,17 @@ void synchronize_rcu_expedited(void) // them, which allows reuse of ->gp_seq_polled_exp_snap. rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); - if (rcu_init_invoked()) - cond_resched(); + + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT); + local_irq_restore(flags); return; // Context allows vacuous grace periods. } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); + wait_rcu_gp(call_rcu_hurry); return; } @@ -1028,6 +1034,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); /** + * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period + * @rgosp: Place to put snapshot of grace-period state + * + * Places the normal and expedited grace-period states in rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed expedited grace period is not already slated to start, + * initiates that grace period. + */ +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + (void)start_poll_synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full); + +/** * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period * * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() @@ -1053,3 +1077,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate) synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu_expedited() + * to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. 
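The expedited full-state variants compose the same way. A hedged sketch, with my_dev and its helpers invented for illustration: pre-start an expedited grace period when the object is unpublished, then wait at free time only if that grace period has not yet completed.

#include <linux/rcupdate.h>

/* Hypothetical device state carrying a full-state cookie. */
struct my_dev {
	struct rcu_gp_oldstate unreg_cookie;
};

/* Step 1: unpublish the device and start an expedited GP in the background. */
static void my_dev_unpublish(struct my_dev *d)
{
	/* ...remove all RCU-visible pointers to d here... */
	start_poll_synchronize_rcu_expedited_full(&d->unreg_cookie);
}

/* Step 2: before freeing, wait only if that GP has not yet completed. */
static void my_dev_free(struct my_dev *d)
{
	cond_synchronize_rcu_expedited_full(&d->unreg_cookie);
	/* kfree(d); */
}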
+ */ +void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index a8f574d8850d..9e1c8caec5ce 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -257,6 +257,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) } /* + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that + * can elapse before lazy callbacks are flushed. Lazy callbacks + * could be flushed much earlier for a number of other reasons + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are + * left unsubmitted to RCU after those many jiffies. + */ +#define LAZY_FLUSH_JIFFIES (10 * HZ) +static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; + +#ifdef CONFIG_RCU_LAZY +// To be called only from test code. +void rcu_lazy_set_jiffies_till_flush(unsigned long jif) +{ + jiffies_till_flush = jif; +} +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); + +unsigned long rcu_lazy_get_jiffies_till_flush(void) +{ + return jiffies_till_flush; +} +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); +#endif + +/* * Arrange to wake the GP kthread for this NOCB group at some future * time when it is safe to do so. */ @@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); /* - * Bypass wakeup overrides previous deferments. In case - * of callback storm, no need to wake up too early. + * Bypass wakeup overrides previous deferments. In case of + * callback storms, no need to wake up too early. */ - if (waketype == RCU_NOCB_WAKE_BYPASS) { + if (waketype == RCU_NOCB_WAKE_LAZY && + rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else if (waketype == RCU_NOCB_WAKE_BYPASS) { mod_timer(&rdp_gp->nocb_timer, jiffies + 2); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else { @@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * proves to be initially empty, just return false because the no-CB GP * kthread may need to be awakened in this case. * + * Return true if there was something to be flushed and it succeeded, otherwise + * false. + * * Note that this function always returns true if rhp is NULL. */ -static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in, + unsigned long j, bool lazy) { struct rcu_cblist rcl; + struct rcu_head *rhp = rhp_in; WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); rcu_lockdep_assert_cblist_protected(rdp); @@ -310,7 +343,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ if (rhp) rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + + /* + * If the new CB requested was a lazy one, queue it onto the main + * ->cblist so that we can take advantage of the grace-period that will + * happen regardless. But queue it onto the bypass list first so that + * the lazy CB is ordered with the existing CBs in the bypass list. 
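The jiffies_till_flush knob above is exported strictly for test code. An in-tree test might shrink it temporarily along the lines of the sketch below; my_lazy_test() is invented, the prototypes are assumed to come from the RCU-internal rcu.h header that in-tree test code already includes, and CONFIG_RCU_LAZY=y is assumed so the accessors exist.

#include <linux/jiffies.h>
#include <linux/rcupdate.h>

/* Hypothetical test helper: run fn() with lazy flushing sped up to ~100ms. */
static void my_lazy_test(void (*fn)(void))
{
	unsigned long orig = rcu_lazy_get_jiffies_till_flush();

	rcu_lazy_set_jiffies_till_flush(msecs_to_jiffies(100));
	fn();			/* exercise call_rcu() lazy batching */
	rcu_barrier();		/* drain outstanding callbacks before restoring */
	rcu_lazy_set_jiffies_till_flush(orig);
}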
+ */ + if (lazy && rhp) { + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + rhp = NULL; + } rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + WRITE_ONCE(rdp->lazy_len, 0); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); WRITE_ONCE(rdp->nocb_bypass_first, j); rcu_nocb_bypass_unlock(rdp); @@ -326,13 +372,13 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, * Note that this function always returns true if rhp is NULL. */ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) + unsigned long j, bool lazy) { if (!rcu_rdp_is_offloaded(rdp)) return true; rcu_lockdep_assert_cblist_protected(rdp); rcu_nocb_bypass_lock(rdp); - return rcu_nocb_do_flush_bypass(rdp, rhp, j); + return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy); } /* @@ -345,7 +391,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) if (!rcu_rdp_is_offloaded(rdp) || !rcu_nocb_bypass_trylock(rdp)) return; - WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false)); } /* @@ -367,12 +413,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) * there is only one CPU in operation. */ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) + bool *was_alldone, unsigned long flags, + bool lazy) { unsigned long c; unsigned long cur_gp_seq; unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len)); lockdep_assert_irqs_disabled(); @@ -417,24 +465,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // If there hasn't yet been all that many ->cblist enqueues // this jiffy, tell the caller to enqueue onto ->cblist. But flush // ->nocb_bypass first. - if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + // Lazy CBs throttle this back and do immediate bypass queuing. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); if (*was_alldone) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstQ")); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false)); WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); return false; // Caller must enqueue the callback. } // If ->nocb_bypass has been used too long or is too full, // flush ->nocb_bypass to ->cblist. - if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || + (ncbs && bypass_is_lazy && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || ncbs >= qhimark) { rcu_nocb_lock(rdp); - if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + + if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) { if (*was_alldone) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstQ")); @@ -447,7 +500,12 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, rcu_advance_cbs_nowake(rdp->mynode, rdp); rdp->nocb_gp_adv_time = j; } - rcu_nocb_unlock_irqrestore(rdp, flags); + + // The flush succeeded and we moved CBs into the regular list. + // Don't wait for the wake up timer as it may be too far ahead. + // Wake up the GP thread now instead, if the cblist was empty. 
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags); + return true; // Callback already enqueued. } @@ -457,13 +515,24 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + + if (lazy) + WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1); + if (!ncbs) { WRITE_ONCE(rdp->nocb_bypass_first, j); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); smp_mb(); /* Order enqueue before wake. */ - if (ncbs) { + // A wake up of the grace period kthread or timer adjustment + // needs to be done only if: + // 1. Bypass list was fully empty before (this is the first + // bypass list entry), or: + // 2. Both of these conditions are met: + // a. The bypass list previously had only lazy CBs, and: + // b. The new CB is non-lazy. + if (ncbs && (!bypass_is_lazy || lazy)) { local_irq_restore(flags); } else { // No-CBs GP kthread might be indefinitely asleep, if so, wake. @@ -491,8 +560,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, unsigned long flags) __releases(rdp->nocb_lock) { + long bypass_len; unsigned long cur_gp_seq; unsigned long j; + long lazy_len; long len; struct task_struct *t; @@ -506,9 +577,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } // Need to actually to a wakeup. len = rcu_segcblist_n_cbs(&rdp->cblist); + bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass); + lazy_len = READ_ONCE(rdp->lazy_len); if (was_alldone) { rdp->qlen_last_fqs_check = len; - if (!irqs_disabled_flags(flags)) { + // Only lazy CBs in bypass list + if (lazy_len && bypass_len == lazy_len) { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, + TPS("WakeLazy")); + } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); @@ -599,12 +677,12 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) static void nocb_gp_wait(struct rcu_data *my_rdp) { bool bypass = false; - long bypass_ncbs; int __maybe_unused cpu = my_rdp->cpu; unsigned long cur_gp_seq; unsigned long flags; bool gotcbs = false; unsigned long j = jiffies; + bool lazy = false; bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; @@ -634,24 +712,43 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * won't be ignored for long. */ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) { + long bypass_ncbs; + bool flush_bypass = false; + long lazy_ncbs; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); lockdep_assert_held(&rdp->nocb_lock); bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (bypass_ncbs && + lazy_ncbs = READ_ONCE(rdp->lazy_len); + + if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || + bypass_ncbs > 2 * qhimark)) { + flush_bypass = true; + } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || bypass_ncbs > 2 * qhimark)) { - // Bypass full or old, so flush it. 
- (void)rcu_nocb_try_flush_bypass(rdp, j); - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + flush_bypass = true; } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); continue; /* No callbacks here, try next. */ } + + if (flush_bypass) { + // Bypass full or old, so flush it. + (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + lazy_ncbs = READ_ONCE(rdp->lazy_len); + } + if (bypass_ncbs) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("Bypass")); - bypass = true; + bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass")); + if (bypass_ncbs == lazy_ncbs) + lazy = true; + else + bypass = true; } rnp = rdp->mynode; @@ -699,12 +796,20 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) my_rdp->nocb_gp_gp = needwait_gp; my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; - if (bypass && !rcu_nocb_poll) { - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. - wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, - TPS("WakeBypassIsDeferred")); + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + if (!rcu_nocb_poll) { + // If bypass list only has lazy CBs. Add a deferred lazy wake up. + if (lazy && !bypass) { + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY, + TPS("WakeLazyIsDeferred")); + // Otherwise add a deferred bypass wake up. + } else if (bypass) { + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, + TPS("WakeBypassIsDeferred")); + } } + if (rcu_nocb_poll) { /* Polling, so trace if first poll in the series. */ if (gotcbs) @@ -1030,7 +1135,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) * return false, which means that future calls to rcu_nocb_try_bypass() * will refuse to put anything into the bypass. */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); /* * Start with invoking rcu_core() early. This way if the current thread * happens to preempt an ongoing call to rcu_core() in the middle, @@ -1111,7 +1216,7 @@ int rcu_nocb_cpu_deoffload(int cpu) if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } @@ -1196,7 +1301,7 @@ int rcu_nocb_cpu_offload(int cpu) if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); + pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } @@ -1207,47 +1312,87 @@ int rcu_nocb_cpu_offload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); -void __init rcu_init_nohz(void) +static unsigned long +lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; - bool need_rcu_nocb_mask = false; - bool offload_all = false; - struct rcu_data *rdp; + unsigned long count = 0; -#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) - if (!rcu_state.nocb_is_setup) { - need_rcu_nocb_mask = true; - offload_all = true; + /* Snapshot count of all CPUs */ + for_each_possible_cpu(cpu) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + count += READ_ONCE(rdp->lazy_len); } -#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { - need_rcu_nocb_mask = true; - offload_all = false; /* NO_HZ_FULL has its own mask. */ + return count ? 
count : SHRINK_EMPTY; +} + +static unsigned long +lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long flags; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_possible_cpu(cpu) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int _count = READ_ONCE(rdp->lazy_len); + + if (_count == 0) + continue; + rcu_nocb_lock_irqsave(rdp, flags); + WRITE_ONCE(rdp->lazy_len, 0); + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); + sc->nr_to_scan -= _count; + count += _count; + if (sc->nr_to_scan <= 0) + break; } -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + return count ? count : SHRINK_STOP; +} - if (need_rcu_nocb_mask) { +static struct shrinker lazy_rcu_shrinker = { + .count_objects = lazy_rcu_shrink_count, + .scan_objects = lazy_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + +void __init rcu_init_nohz(void) +{ + int cpu; + struct rcu_data *rdp; + const struct cpumask *cpumask = NULL; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) + cpumask = tick_nohz_full_mask; +#endif + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) && + !rcu_state.nocb_is_setup && !cpumask) + cpumask = cpu_possible_mask; + + if (cpumask) { if (!cpumask_available(rcu_nocb_mask)) { if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); return; } } + + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask); rcu_state.nocb_is_setup = true; } if (!rcu_state.nocb_is_setup) return; -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (offload_all) - cpumask_setall(rcu_nocb_mask); + if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) + pr_err("Failed to register lazy_rcu shrinker!\n"); if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); @@ -1284,6 +1429,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); rcu_cblist_init(&rdp->nocb_bypass); + WRITE_ONCE(rdp->lazy_len, 0); mutex_init(&rdp->nocb_gp_kthread_mutex); } @@ -1452,8 +1598,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) (long)rdp->nocb_gp_seq, rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ @@ -1497,7 +1643,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. 
*/ @@ -1564,14 +1710,19 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + return false; +} + static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) + unsigned long j, bool lazy) { return true; } static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) + bool *was_alldone, unsigned long flags, bool lazy) { return false; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 438ecae6bd7e..7b0fe741a088 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || (rdp->grpmask & READ_ONCE(rnp->expmask)) || - IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. @@ -718,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user) struct task_struct *t = current; lockdep_assert_irqs_disabled(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - rcu_note_voluntary_context_switch(current); - } if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ @@ -824,6 +822,7 @@ void rcu_read_unlock_strict(void) if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } @@ -869,7 +868,7 @@ void rcu_all_qs(void) if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) return; - preempt_disable(); + preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { preempt_enable(); @@ -931,10 +930,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) return false; } -// Except that we do need to respond to a request by an expedited grace -// period for a quiescent state from this CPU. Note that requests from -// tasks are handled when removing the task from the blocked-tasks list -// below. +// Except that we do need to respond to a request by an expedited +// grace period for a quiescent state from this CPU. Note that in +// non-preemptible kernels, there can be no context switches within RCU +// read-side critical sections, which in turn means that the leaf rcu_node +// structure's blocked-tasks list is always empty. is therefore no need to +// actually check it. Instead, a quiescent state from this CPU suffices, +// and this function is only called from such a quiescent state. notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -972,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user) * neither access nor modify, at least not while the * corresponding CPU is online. */ - rcu_qs(); } } @@ -1220,11 +1221,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) * We don't include outgoingcpu in the affinity set, use -1 if there is * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. 
+ * + * Any future concurrent calls are serialized via ->boost_kthread_mutex. */ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rcu_rnp_online_cpus(rnp); + unsigned long mask; cpumask_var_t cm; int cpu; @@ -1233,13 +1236,17 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; mutex_lock(&rnp->boost_kthread_mutex); + mask = rcu_rnp_online_cpus(rnp); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) + if (cpumask_empty(cm)) { cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } set_cpus_allowed_ptr(t, cm); mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c3fbbcc09327..5653560573e2 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); - else if (!trigger_single_cpu_backtrace(cpu)) + else dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); } else { pr_err("Stack dump where RCU GP kthread last ran:\n"); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); } } wake_up_process(gpk); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 738842c4886b..f5e6a2f95a2a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -224,7 +224,7 @@ void rcu_test_sync_prims(void) synchronize_rcu_expedited(); } -#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) +#if !defined(CONFIG_TINY_RCU) /* * Switch to run-time mode once RCU has fully initialized. 
@@ -239,7 +239,7 @@ static int __init rcu_set_runtime_mode(void) } core_initcall(rcu_set_runtime_mode); -#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ +#endif /* #if !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -559,10 +559,8 @@ static void early_boot_test_call_rcu(void) struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); - if (IS_ENABLED(CONFIG_SRCU)) { - early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); - call_srcu(&early_srcu, &shead, test_callback); - } + early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); + call_srcu(&early_srcu, &shead, test_callback); rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (!WARN_ON_ONCE(!rhp)) kfree_rcu(rhp, rh); @@ -585,11 +583,9 @@ static int rcu_verify_early_boot_tests(void) if (rcu_self_test) { early_boot_test_counter++; rcu_barrier(); - if (IS_ENABLED(CONFIG_SRCU)) { - early_boot_test_counter++; - srcu_barrier(&early_srcu); - WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); - } + early_boot_test_counter++; + srcu_barrier(&early_srcu); + WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); } if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); diff --git a/kernel/reboot.c b/kernel/reboot.c index 3c35445bf5ad..3bba88c7ffc6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -243,6 +243,17 @@ void migrate_to_reboot_cpu(void) set_cpus_allowed_ptr(current, cpumask_of(cpu)); } +/* + * Notifier list for kernel code which wants to be called + * to prepare system for restart. + */ +static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list); + +static void do_kernel_restart_prepare(void) +{ + blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -254,6 +265,7 @@ void migrate_to_reboot_cpu(void) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + do_kernel_restart_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) @@ -396,6 +408,11 @@ register_sys_off_handler(enum sys_off_mode mode, handler->list = &power_off_handler_list; break; + case SYS_OFF_MODE_RESTART_PREPARE: + handler->list = &restart_prep_handler_list; + handler->blocking = true; + break; + case SYS_OFF_MODE_RESTART: handler->list = &restart_handler_list; break; diff --git a/kernel/relay.c b/kernel/relay.c index 6a611e779e95..ef12532168d9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); + return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); } /* @@ -151,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf; - if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *)) + if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t)) return NULL; buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) return NULL; - buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *), + buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -510,7 +507,7 @@ struct rchan *relay_open(const char *base_filename, chan->private_data = private_data; if (base_filename) { 
chan->has_base_filename = 1; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + strscpy(chan->base_filename, base_filename, NAME_MAX); } chan->cb = cb; kref_init(&chan->kref); @@ -581,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan, if (!chan || !base_filename) return -EINVAL; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + strscpy(chan->base_filename, base_filename, NAME_MAX); mutex_lock(&relay_channels_mutex); /* Is chan already set up? */ diff --git a/kernel/resource.c b/kernel/resource.c index 4c5e80b92f2f..ddbbacb9fb50 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -888,7 +888,7 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) if (conflict->end > new->end) new->end = conflict->end; - printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); } write_unlock(&resource_lock); } @@ -1283,9 +1283,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); - printk(KERN_WARNING "Trying to free nonexistent resource " - "<%016llx-%016llx>\n", (unsigned long long)start, - (unsigned long long)end); + pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end); } EXPORT_SYMBOL(__release_region); @@ -1658,6 +1656,7 @@ __setup("reserve=", reserve_setup); int iomem_map_sanity_check(resource_size_t addr, unsigned long size) { struct resource *p = &iomem_resource; + resource_size_t end = addr + size - 1; int err = 0; loff_t l; @@ -1667,12 +1666,12 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) * We can probably skip the resources without * IORESOURCE_IO attribute? */ - if (p->start >= addr + size) + if (p->start > end) continue; if (p->end < addr) continue; if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && - PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) + PFN_DOWN(p->end) >= PFN_DOWN(end)) continue; /* * if a resource is "BUSY", it's not a hardware resource @@ -1683,10 +1682,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (p->flags & IORESOURCE_BUSY) continue; - printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", - (unsigned long long)addr, - (unsigned long long)(addr + size - 1), - p->name, p); + pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n", + &addr, &end, p->name, p); err = -1; break; } @@ -1707,18 +1704,15 @@ static int strict_iomem_checks; * * Returns true if exclusive to the kernel, otherwise returns false. 
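The kernel/resource.c warnings converted above use the %pa specifier, which prints a phys_addr_t or resource_size_t and, unlike %llx, takes its argument by reference. A minimal sketch of the calling convention; the function name is made up for illustration.

#include <linux/printk.h>
#include <linux/types.h>

static void demo_report_range(resource_size_t start, resource_size_t end)
{
	/* %pa dereferences its argument, so pass pointers, not values. */
	pr_warn("demo resource range <%pa-%pa>\n", &start, &end);
}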
*/ -bool iomem_is_exclusive(u64 addr) +bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size) { const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM | IORESOURCE_EXCLUSIVE; bool skip_children = false, err = false; - int size = PAGE_SIZE; struct resource *p; - addr = addr & PAGE_MASK; - read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, skip_children) { + for_each_resource(root, p, skip_children) { if (p->start >= addr + size) break; if (p->end < addr) { @@ -1757,6 +1751,12 @@ bool iomem_is_exclusive(u64 addr) return err; } +bool iomem_is_exclusive(u64 addr) +{ + return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK, + PAGE_SIZE); +} + struct resource_entry *resource_list_create_entry(struct resource *res, size_t extra_size) { diff --git a/kernel/rseq.c b/kernel/rseq.c index bda8175f8f99..d38ab944105d 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -171,12 +171,27 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) return 0; } +static bool rseq_warn_flags(const char *str, u32 flags) +{ + u32 test_flags; + + if (!flags) + return false; + test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; + if (test_flags) + pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); + test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; + if (test_flags) + pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); + return true; +} + static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; - if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags) + if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ @@ -184,7 +199,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) if (ret) return ret; - if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags) + if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 4ebaf97f7bd8..991fc9002535 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) struct task_struct *t; unsigned long flags; - BUG_ON(!lock_task_sighand(p, &flags)); + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) + return; prev = p->signal->autogroup; if (prev == ag) { diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 35f15c26ed54..d57a5c1c1cd9 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -204,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) return t; return 0; @@ -241,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); +int __sched wait_for_completion_state(struct completion *x, unsigned int state) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); + + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_state); + /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 
ee28253c9ac0..25b582b6ee5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <uapi/linux/sched/types.h> +#include <asm/irq_regs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -142,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -#ifdef CONFIG_PREEMPT_RT -const_debug unsigned int sysctl_sched_nr_migrate = 8; -#else -const_debug unsigned int sysctl_sched_nr_migrate = 32; -#endif +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -360,10 +357,7 @@ static void __sched_core_flip(bool enabled) /* * Toggle the offline CPUs. */ - cpumask_copy(&sched_core_mask, cpu_possible_mask); - cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); - - for_each_cpu(cpu, &sched_core_mask) + for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask) cpu_rq(cpu)->core_enabled = enabled; cpus_read_unlock(); @@ -481,8 +475,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * p->se.load, p->rt_priority, * p->dl.dl_{runtime, deadline, period, flags, bw, density} * - sched_setnuma(): p->numa_preferred_nid - * - sched_move_task()/ - * cpu_cgroup_fork(): p->sched_task_group + * - sched_move_task(): p->sched_task_group * - uclamp_update_active() p->uclamp* * * p->state <- TASK_*: @@ -708,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { @@ -1398,7 +1392,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) return; - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); + uclamp_rq_set(rq, clamp_id, clamp_value); } static inline @@ -1549,8 +1543,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, if (bucket->tasks == 1 || uc_se->value > bucket->value) bucket->value = uc_se->value; - if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, uc_se->value); + if (uc_se->value > uclamp_rq_get(rq, clamp_id)) + uclamp_rq_set(rq, clamp_id, uc_se->value); } /* @@ -1616,7 +1610,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, if (likely(bucket->tasks)) return; - rq_clamp = READ_ONCE(uc_rq->value); + rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fixup the expected value. 
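The uclamp hunks above replace open-coded READ_ONCE()/WRITE_ONCE() accesses to rq->uclamp[clamp_id].value with uclamp_rq_get()/uclamp_rq_set(). The helpers themselves live in kernel/sched/sched.h and are not part of this hunk; assuming they are thin wrappers, their shape is roughly:

static inline unsigned long uclamp_rq_get(struct rq *rq,
					  enum uclamp_id clamp_id)
{
	return READ_ONCE(rq->uclamp[clamp_id].value);
}

static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int value)
{
	WRITE_ONCE(rq->uclamp[clamp_id].value, value);
}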
@@ -1624,7 +1618,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, SCHED_WARN_ON(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); - WRITE_ONCE(uc_rq->value, bkt_clamp); + uclamp_rq_set(rq, clamp_id, bkt_clamp); } } @@ -2059,7 +2053,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) { sched_info_enqueue(rq, p); - psi_enqueue(p, flags & ENQUEUE_WAKEUP); + psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } uclamp_rq_inc(rq, p); @@ -2195,14 +2189,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP static void -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags); + struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { + struct affinity_context ac = { + .new_mask = cpumask_of(rq->cpu), + .flags = SCA_MIGRATE_DISABLE, + }; + if (likely(!p->migration_disabled)) return; @@ -2212,7 +2210,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) /* * Violates locking rules! see comment in __do_set_cpus_allowed(). */ - __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); + __do_set_cpus_allowed(p, &ac); } void migrate_disable(void) @@ -2234,6 +2232,10 @@ EXPORT_SYMBOL_GPL(migrate_disable); void migrate_enable(void) { struct task_struct *p = current; + struct affinity_context ac = { + .new_mask = &p->cpus_mask, + .flags = SCA_MIGRATE_ENABLE, + }; if (p->migration_disabled > 1) { p->migration_disabled--; @@ -2249,7 +2251,7 @@ void migrate_enable(void) */ preempt_disable(); if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + __set_cpus_allowed_ptr(p, &ac); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. @@ -2328,7 +2330,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq = cpu_rq(new_cpu); rq_lock(rq, rf); - BUG_ON(task_cpu(p) != new_cpu); + WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); @@ -2529,19 +2531,25 @@ out_unlock: * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. 
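The affinity hunks above and below fold the (new_mask, flags) argument pairs into a single struct affinity_context. Its definition sits in kernel/sched/sched.h rather than in this diff; judging from the designated initializers used here, it is essentially:

struct affinity_context {
	const struct cpumask	*new_mask;	/* mask to install */
	struct cpumask		*user_mask;	/* swapped with p->user_cpus_ptr when SCA_USER is set */
	unsigned int		flags;		/* SCA_* flags */
};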
*/ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) +void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx) { - if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { - p->cpus_ptr = new_mask; + if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = ctx->new_mask; return; } - cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + cpumask_copy(&p->cpus_mask, ctx->new_mask); + p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + + /* + * Swap in a new user_cpus_ptr if SCA_USER flag set + */ + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); } static void -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); bool queued, running; @@ -2558,7 +2566,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 * * XXX do further audits, this smells like something putrid. */ - if (flags & SCA_MIGRATE_DISABLE) + if (ctx->flags & SCA_MIGRATE_DISABLE) SCHED_WARN_ON(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -2577,7 +2585,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask, flags); + p->sched_class->set_cpus_allowed(p, ctx); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -2585,14 +2593,27 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 set_next_task(rq, p); } +/* + * Used for kthread_bind() and select_fallback_rq(), in both cases the user + * affinity (if any) should be destroyed too. + */ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - __do_set_cpus_allowed(p, new_mask, 0); + struct affinity_context ac = { + .new_mask = new_mask, + .user_mask = NULL, + .flags = SCA_USER, /* clear the user requested mask */ + }; + + __do_set_cpus_allowed(p, &ac); + kfree(ac.user_mask); } int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { + unsigned long flags; + if (!src->user_cpus_ptr) return 0; @@ -2600,7 +2621,10 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, if (!dst->user_cpus_ptr) return -ENOMEM; + /* Use pi_lock to protect content of user_cpus_ptr */ + raw_spin_lock_irqsave(&src->pi_lock, flags); cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + raw_spin_unlock_irqrestore(&src->pi_lock, flags); return 0; } @@ -2696,6 +2720,8 @@ void release_user_cpus_ptr(struct task_struct *p) */ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, int dest_cpu, unsigned int flags) + __releases(rq->lock) + __releases(p->pi_lock) { struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; @@ -2778,7 +2804,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { /* * MIGRATE_ENABLE gets here because 'p == current', but for * anything else we cannot do is_migration_disabled(), punt @@ -2838,8 +2864,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag * Called with both p->pi_lock and rq->lock held; drops both before returning. 
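Note the ownership handoff above: with SCA_USER set, set_cpus_allowed_common() swap()s p->user_cpus_ptr with ctx->user_mask, so after __do_set_cpus_allowed() returns the context holds the task's previous user mask and the caller's kfree(), as in do_set_cpus_allowed(), releases the old allocation (or NULL). A reduced sketch of the idiom with hypothetical names:

#include <linux/kernel.h>	/* swap() */
#include <linux/sched.h>
#include <linux/slab.h>

static void demo_install_user_mask(struct task_struct *p,
				   struct cpumask **incoming)
{
	/* Give the task the new mask, take ownership of the old one. */
	swap(p->user_cpus_ptr, *incoming);
}

static void demo_caller(struct task_struct *p, struct cpumask *new_mask)
{
	demo_install_user_mask(p, &new_mask);
	kfree(new_mask);	/* now the task's previous mask, possibly NULL */
}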
*/ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags, + struct affinity_context *ctx, struct rq *rq, struct rq_flags *rf) __releases(rq->lock) @@ -2848,7 +2873,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; bool kthread = p->flags & PF_KTHREAD; - struct cpumask *user_mask = NULL; unsigned int dest_cpu; int ret = 0; @@ -2868,7 +2892,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } - if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) { ret = -EINVAL; goto out; } @@ -2877,18 +2901,18 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { + if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (!(flags & SCA_MIGRATE_ENABLE)) { - if (cpumask_equal(&p->cpus_mask, new_mask)) + if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) goto out; if (WARN_ON_ONCE(p == current && is_migration_disabled(p) && - !cpumask_test_cpu(task_cpu(p), new_mask))) { + !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) { ret = -EBUSY; goto out; } @@ -2899,22 +2923,15 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, * for groups of tasks (ie. cpuset), so that load balancing is not * immediately required to distribute the tasks within their new mask. */ - dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } - __do_set_cpus_allowed(p, new_mask, flags); - - if (flags & SCA_USER) - user_mask = clear_user_cpus_ptr(p); + __do_set_cpus_allowed(p, ctx); - ret = affine_move_task(rq, p, rf, dest_cpu, flags); - - kfree(user_mask); - - return ret; + return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); out: task_rq_unlock(rq, p, rf); @@ -2932,25 +2949,41 @@ out: * call is not atomic; no spinlocks may be held. */ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, u32 flags) + struct affinity_context *ctx) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); + /* + * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_* + * flags are set. + */ + if (p->user_cpus_ptr && + !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) && + cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) + ctx->new_mask = rq->scratch_mask; + + return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf); } int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, 0); + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; + + return __set_cpus_allowed_ptr(p, &ac); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Change a given task's CPU affinity to the intersection of its current - * affinity mask and @subset_mask, writing the resulting mask to @new_mask - * and pointing @p->user_cpus_ptr to a copy of the old mask. 
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask. + * If user_cpus_ptr is defined, use it as the basis for restricting CPU + * affinity or use cpu_online_mask instead. + * * If the resulting mask is empty, leave the affinity unchanged and return * -EINVAL. */ @@ -2958,17 +2991,14 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p, struct cpumask *new_mask, const struct cpumask *subset_mask) { - struct cpumask *user_mask = NULL; + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; struct rq_flags rf; struct rq *rq; int err; - if (!p->user_cpus_ptr) { - user_mask = kmalloc(cpumask_size(), GFP_KERNEL); - if (!user_mask) - return -ENOMEM; - } - rq = task_rq_lock(p, &rf); /* @@ -2981,31 +3011,21 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p, goto err_unlock; } - if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) { err = -EINVAL; goto err_unlock; } - /* - * We're about to butcher the task affinity, so keep track of what - * the user asked for in case we're able to restore it later on. - */ - if (user_mask) { - cpumask_copy(user_mask, p->cpus_ptr); - p->user_cpus_ptr = user_mask; - } - - return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf); err_unlock: task_rq_unlock(rq, p, &rf); - kfree(user_mask); return err; } /* * Restrict the CPU affinity of task @p so that it is a subset of - * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the * old affinity mask. If the resulting mask is empty, we warn and walk * up the cpuset hierarchy until we find a suitable mask. */ @@ -3049,34 +3069,29 @@ out_free_mask: } static int -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); /* * Restore the affinity of a task @p which was previously restricted by a - * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) - * @p->user_cpus_ptr. + * call to force_compatible_cpus_allowed_ptr(). * * It is the caller's responsibility to serialise this with any calls to * force_compatible_cpus_allowed_ptr(@p). */ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) { - struct cpumask *user_mask = p->user_cpus_ptr; - unsigned long flags; + struct affinity_context ac = { + .new_mask = task_user_cpus(p), + .flags = 0, + }; + int ret; /* - * Try to restore the old affinity mask. If this fails, then - * we free the mask explicitly to avoid it being inherited across - * a subsequent fork(). + * Try to restore the old affinity mask with __sched_setaffinity(). + * Cpuset masking will be done there too. */ - if (!user_mask || !__sched_setaffinity(p, user_mask)) - return; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - user_mask = clear_user_cpus_ptr(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - kfree(user_mask); + ret = __sched_setaffinity(p, &ac); + WARN_ON_ONCE(ret); } void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -3254,12 +3269,12 @@ out: /* * wait_task_inactive - wait for a thread to unschedule. * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). 
If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't @@ -3290,12 +3305,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will + * But we don't care, since "task_on_cpu()" will * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) { - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) + while (task_on_cpu(rq, p)) { + if (!(READ_ONCE(p->__state) & match_state)) return 0; cpu_relax(); } @@ -3307,10 +3322,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state */ rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); - running = task_running(rq, p); + running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || READ_ONCE(p->__state) == match_state) + if (READ_ONCE(p->__state) & match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -3554,10 +3569,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) + struct affinity_context *ctx) { - return set_cpus_allowed_ptr(p, new_mask); + return set_cpus_allowed_ptr(p, ctx->new_mask); } static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } @@ -3725,13 +3739,6 @@ void sched_ttwu_pending(void *arg) if (!llist) return; - /* - * rq::ttwu_pending racy indication of out-standing wakeups. - * Races such that false-negatives are possible, since they - * are shorter lived that false-positives would be. - */ - WRITE_ONCE(rq->ttwu_pending, 0); - rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -3745,6 +3752,17 @@ void sched_ttwu_pending(void *arg) ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); } + /* + * Must be after enqueueing at least once task such that + * idle_cpu() does not observe a false-negative -- if it does, + * it is possible for select_idle_siblings() to stack a number + * of tasks on this CPU during that window. + * + * It is ok to clear ttwu_pending when another task pending. + * We will receive IPI after local irq enabled and then enqueue it. + * Since now nr_running > 0, idle_cpu() will always get correct result. + */ + WRITE_ONCE(rq->ttwu_pending, 0); rq_unlock_irqrestore(rq, &rf); } @@ -4206,6 +4224,40 @@ out: return success; } +static bool __task_needs_rq_lock(struct task_struct *p) +{ + unsigned int state = READ_ONCE(p->__state); + + /* + * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when + * the task is blocked. Make sure to check @state since ttwu() can drop + * locks at the end, see ttwu_queue_wakelist(). 
+ */ + if (state == TASK_RUNNING || state == TASK_WAKING) + return true; + + /* + * Ensure we load p->on_rq after p->__state, otherwise it would be + * possible to, falsely, observe p->on_rq == 0. + * + * See try_to_wake_up() for a longer comment. + */ + smp_rmb(); + if (p->on_rq) + return true; + +#ifdef CONFIG_SMP + /* + * Ensure the task has finished __schedule() and will not be referenced + * anymore. Again, see try_to_wake_up() for a longer comment. + */ + smp_rmb(); + smp_cond_load_acquire(&p->on_cpu, !VAL); +#endif + + return false; +} + /** * task_call_func - Invoke a function on task in fixed state * @p: Process for which the function is to be invoked, can be @current. @@ -4223,28 +4275,12 @@ out: int task_call_func(struct task_struct *p, task_call_f func, void *arg) { struct rq *rq = NULL; - unsigned int state; struct rq_flags rf; int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - state = READ_ONCE(p->__state); - - /* - * Ensure we load p->on_rq after p->__state, otherwise it would be - * possible to, falsely, observe p->on_rq == 0. - * - * See try_to_wake_up() for a longer comment. - */ - smp_rmb(); - - /* - * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when - * the task is blocked. Make sure to check @state since ttwu() can drop - * locks at the end, see ttwu_queue_wakelist(). - */ - if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq) + if (__task_needs_rq_lock(p)) rq = __task_rq_lock(p, &rf); /* @@ -4396,7 +4432,18 @@ void set_numabalancing_state(bool enabled) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_numa_balancing(struct ctl_table *table, int write, +static void reset_memory_tiering(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + pgdat->nbp_threshold = 0; + pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + pgdat->nbp_th_start = jiffies_to_msecs(jiffies); + } +} + +static int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4412,6 +4459,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, if (err < 0) return err; if (write) { + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + (state & NUMA_BALANCING_MEMORY_TIERING)) + reset_memory_tiering(); sysctl_numa_balancing_mode = state; __set_numabalancing_state(state); } @@ -4520,6 +4570,17 @@ static struct ctl_table sched_core_sysctls[] = { .proc_handler = sysctl_sched_uclamp_handler, }, #endif /* CONFIG_UCLAMP_TASK */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_FOUR, + }, +#endif /* CONFIG_NUMA_BALANCING */ {} }; static int __init sched_core_sysctl_init(void) @@ -4815,10 +4876,10 @@ static inline void finish_task(struct task_struct *prev) #ifdef CONFIG_SMP -static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); - struct callback_head *next; + struct balance_callback *next; lockdep_assert_rq_held(rq); @@ -4845,15 +4906,15 @@ static void balance_push(struct rq *rq); * This abuse is tolerated because it places all the unlikely/odd cases behind * a single test, namely: rq->balance_callback == NULL. 
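Balance callbacks previously reused struct callback_head, which is what forced the function-pointer cast removed just below. The dedicated struct balance_callback is declared in kernel/sched/sched.h; from its use here it is essentially:

struct balance_callback {
	struct balance_callback	*next;
	void			(*func)(struct rq *rq);
};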
*/ -struct callback_head balance_push_callback = { +struct balance_callback balance_push_callback = { .next = NULL, - .func = (void (*)(struct callback_head *))balance_push, + .func = balance_push, }; -static inline struct callback_head * +static inline struct balance_callback * __splice_balance_callbacks(struct rq *rq, bool split) { - struct callback_head *head = rq->balance_callback; + struct balance_callback *head = rq->balance_callback; if (likely(!head)) return NULL; @@ -4875,7 +4936,7 @@ __splice_balance_callbacks(struct rq *rq, bool split) return head; } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return __splice_balance_callbacks(rq, true); } @@ -4885,7 +4946,7 @@ static void __balance_callbacks(struct rq *rq) do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { unsigned long flags; @@ -4902,12 +4963,12 @@ static inline void __balance_callbacks(struct rq *rq) { } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return NULL; } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { } @@ -5166,6 +5227,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). 
*/ @@ -5720,8 +5782,7 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -6179,7 +6240,7 @@ static void sched_core_balance(struct rq *rq) preempt_enable(); } -static DEFINE_PER_CPU(struct callback_head, core_balance_head); +static DEFINE_PER_CPU(struct balance_callback, core_balance_head); static void queue_core_balance(struct rq *rq) { @@ -6429,7 +6490,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && - !(prev->flags & PF_FROZEN); + !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -7410,7 +7471,7 @@ static int __sched_setscheduler(struct task_struct *p, int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; - struct callback_head *head; + struct balance_callback *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -8079,7 +8140,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) #endif static int -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { int retval; cpumask_var_t cpus_allowed, new_mask; @@ -8093,13 +8154,16 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) } cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, mask, cpus_allowed); + cpumask_and(new_mask, ctx->new_mask, cpus_allowed); + + ctx->new_mask = new_mask; + ctx->flags |= SCA_CHECK; retval = dl_task_check_affinity(p, new_mask); if (retval) goto out_free_new_mask; -again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + + retval = __set_cpus_allowed_ptr(p, ctx); if (retval) goto out_free_new_mask; @@ -8110,7 +8174,24 @@ again: * Just reset the cpumask to the cpuset's cpus_allowed. */ cpumask_copy(new_mask, cpus_allowed); - goto again; + + /* + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() + * will restore the previous user_cpus_ptr value. + * + * In the unlikely event a previous user_cpus_ptr exists, + * we need to further restrict the mask to what is allowed + * by that old user_cpus_ptr. 
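The __schedule_bug() hunk earlier in this chunk now defers to check_panic_on_warn() instead of open-coding the test. That helper lives in kernel/panic.c and is not shown in this diff; at its core it folds away the removed pattern, roughly:

void check_panic_on_warn(const char *origin)
{
	if (panic_on_warn)
		panic("%s: panic_on_warn set ...\n", origin);
}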
+ */ + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { + bool empty = !cpumask_and(new_mask, new_mask, + ctx->user_mask); + + if (WARN_ON_ONCE(empty)) + cpumask_copy(new_mask, cpus_allowed); + } + __set_cpus_allowed_ptr(p, ctx); + retval = -EINVAL; } out_free_new_mask: @@ -8122,6 +8203,8 @@ out_free_cpus_allowed: long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { + struct affinity_context ac; + struct cpumask *user_mask; struct task_struct *p; int retval; @@ -8156,7 +8239,21 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_put_task; - retval = __sched_setaffinity(p, in_mask); + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) { + retval = -ENOMEM; + goto out_put_task; + } + cpumask_copy(user_mask, in_mask); + ac = (struct affinity_context){ + .new_mask = in_mask, + .user_mask = user_mask, + .flags = SCA_USER, + }; + + retval = __sched_setaffinity(p, &ac); + kfree(ac.user_mask); + out_put_task: put_task_struct(p); return retval; @@ -8649,7 +8746,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || !task_is_running(p)) + if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -8861,7 +8958,7 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, read_task_thread_flags(p)); @@ -8889,7 +8986,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p) * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) return false; return true; @@ -8937,6 +9034,12 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { +#ifdef CONFIG_SMP + struct affinity_context ac = (struct affinity_context) { + .new_mask = cpumask_of(cpu), + .flags = 0, + }; +#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -8961,7 +9064,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * * And since this is boot we can forgo the serialization. 
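The sched_show_task() format change earlier in this chunk (%5lu to %-5lu and friends) only switches the fields from right- to left-justified padding so the columns line up. A standalone illustration of the printf flag:

#include <stdio.h>

int main(void)
{
	printf("stack:%5lu pid:%5d|\n", 24UL, 7);	/* "stack:   24 pid:    7|" */
	printf("stack:%-5lu pid:%-5d|\n", 24UL, 7);	/* "stack:24    pid:7    |" */
	return 0;
}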
*/ - set_cpus_allowed_common(idle, cpumask_of(cpu), 0); + set_cpus_allowed_common(idle, &ac); #endif /* * We're having a chicken and egg problem, even though we are @@ -9601,9 +9704,6 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); - void __init sched_init(void) { unsigned long ptr = 0; @@ -9647,14 +9747,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -9759,6 +9851,7 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } set_load_weight(&init_task, false); @@ -10163,7 +10256,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk, int type) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -10179,7 +10272,7 @@ static void sched_change_group(struct task_struct *tsk, int type) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) - tsk->sched_class->task_change_group(tsk, type); + tsk->sched_class->task_change_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -10210,7 +10303,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, TASK_MOVE_GROUP); + sched_change_group(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10288,53 +10381,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -/* - * This is called before wake_up_new_task(), therefore we really only - * have to set its group bits, all the other stuff does not apply. - */ -static void cpu_cgroup_fork(struct task_struct *task) -{ - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - - update_rq_clock(rq); - sched_change_group(task, TASK_SET_GROUP); - - task_rq_unlock(rq, task, &rf); -} - +#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; - int ret = 0; cgroup_taskset_for_each(task, css, tset) { -#ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#endif - /* - * Serialize against wake_up_new_task() such that if it's - * running, we're sure to observe its full state. - */ - raw_spin_lock_irq(&task->pi_lock); - /* - * Avoid calling sched_move_task() before wake_up_new_task() - * has happened. This would lead to problems with PELT, due to - * move wanting to detach+attach while we're not attached yet. 
- */ - if (READ_ONCE(task->__state) == TASK_NEW) - ret = -EINVAL; - raw_spin_unlock_irq(&task->pi_lock); - - if (ret) - break; } - return ret; + return 0; } +#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -11170,8 +11229,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, - .fork = cpu_cgroup_fork, +#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, +#endif .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, @@ -11183,6 +11243,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { + if (cpu == smp_processor_id() && in_hardirq()) { + struct pt_regs *regs; + + regs = get_irq_regs(); + if (regs) { + show_regs(regs); + return; + } + } + + if (trigger_single_cpu_backtrace(cpu)) + return; + pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 93878cb2a46d..a57fd8f27498 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -88,7 +88,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * core has now entered/left forced idle state. Defer accounting to the * next scheduling edge, rather than always forcing a reschedule here. */ - if (task_running(rq, p)) + if (task_on_cpu(rq, p)) resched_curr(rq); task_rq_unlock(rq, p, &rf); @@ -205,7 +205,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, default: err = -EINVAL; goto out; - }; + } if (type == PIDTYPE_PID) { __sched_core_set(task, cookie); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 02d970a879ed..57c92d751bcd 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -123,7 +123,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return 1; /* Ensure the capacity of the CPUs fits the task. 
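cpudl_find() above, and several kernel/sched/deadline.c sites below, now test sched_asym_cpucap_active() instead of reading the static key directly. The helper is defined in kernel/sched/sched.h and is understood to be a thin wrapper, along the lines of:

static inline bool sched_asym_cpucap_active(void)
{
	return static_branch_unlikely(&sched_asym_cpucapacity);
}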
*/ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index fa9ce9d83683..a286e726eb4b 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, int task_pri = convert_prio(p->prio); int idx, cpu; - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0ab79d819a0d..0d97d54276cc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -124,15 +124,12 @@ static inline int dl_bw_cpus(int i) return cpus; } -static inline unsigned long __dl_bw_capacity(int i) +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) { - struct root_domain *rd = cpu_rq(i)->rd; unsigned long cap = 0; + int i; - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - - for_each_cpu_and(i, rd->span, cpu_active_mask) + for_each_cpu_and(i, mask, cpu_active_mask) cap += capacity_orig_of(i); return cap; @@ -144,11 +141,14 @@ static inline unsigned long __dl_bw_capacity(int i) */ static inline unsigned long dl_bw_capacity(int i) { - if (!static_branch_unlikely(&sched_asym_cpucapacity) && + if (!sched_asym_cpucap_active() && capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { - return __dl_bw_capacity(i); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); } } @@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); if (task_on_rq_queued(p)) return; @@ -431,8 +431,8 @@ static void task_non_contending(struct task_struct *p) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); } return; @@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct rb_node *leftmost; - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); leftmost = rb_add_cached(&p->pushable_dl_tasks, &rq->dl.pushable_dl_tasks_root, @@ -644,8 +644,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); @@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! 
*/ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -770,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -803,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } /* @@ -830,16 +837,14 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_of(dl_se)->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; - } + if (dl_se->dl_deadline == 0) + replenish_dl_new_period(dl_se, rq); if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -866,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) @@ -1024,8 +1028,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) return; } - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } } @@ -1333,11 +1336,7 @@ static void update_curr_dl(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (dl_entity_is_special(dl_se)) return; @@ -1616,7 +1615,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); @@ -1640,7 +1639,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); @@ -1814,6 +1813,14 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct 
task_struct *task); static int @@ -1849,16 +1856,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * Take the capacity of the CPU into account to * ensure it fits the requirement of the task. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) select_rq |= !dl_task_fits_capacity(p, cpu); if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -2017,7 +2022,7 @@ static struct task_struct *pick_task_dl(struct rq *rq) return NULL; dl_se = pick_next_dl_entity(dl_rq); - BUG_ON(!dl_se); + WARN_ON_ONCE(!dl_se); p = dl_task_of(dl_se); return p; @@ -2087,7 +2092,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -2225,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -2241,7 +2244,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -2255,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2277,12 +2278,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2428,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). 
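The deadline comparisons above and below (dl_task_is_earliest_deadline(), pull_dl_task()) all go through dl_time_before(), which is a wrap-safe signed comparison rather than a plain less-than; per include/linux/sched/deadline.h it boils down to:

static inline bool dl_time_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}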
*/ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2475,7 +2474,7 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && @@ -2486,13 +2485,12 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) + struct affinity_context *ctx) { struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -2502,7 +2500,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, new_mask)) { + if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -2516,7 +2514,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask, flags); + set_cpus_allowed_common(p, ctx); } /* Assumes rq->lock is held */ @@ -3007,17 +3005,15 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 667876da8382..1637b65ba07a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,6 +333,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min); debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); #endif debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 914096c5b1ae..c36aa54ae071 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,7 @@ #include <linux/cpuidle.h> #include <linux/interrupt.h> +#include <linux/memory-tiers.h> #include <linux/mempolicy.h> #include <linux/mutex_api.h> #include <linux/profile.h> @@ -177,6 +178,11 @@ int __weak arch_asym_cpu_priority(int cpu) static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#ifdef CONFIG_NUMA_BALANCING +/* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ +static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +#endif + #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { { @@ -196,6 +202,16 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif /* CONFIG_NUMA_BALANCING */ {} }; @@ -799,8 +815,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -835,20 +849,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -864,7 +864,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -1094,6 +1106,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + struct numa_group { refcount_t refcount; @@ -1436,6 +1451,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. 
+ */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + wmark_pages(zone, WMARK_PROMO) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct page *page) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1443,9 +1572,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. 
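The hot-page test and the promotion rate limit introduced above come down to two small pieces of arithmetic: a wrap-safe subtraction of the recorded scan time from the fault time, and a candidate counter compared against a per-second budget. A minimal standalone sketch of both, where ACCESS_TIME_MASK, struct node_stats and its field names are illustrative stand-ins rather than the kernel's actual layout:

#include <stdbool.h>
#include <stdio.h>

#define ACCESS_TIME_MASK 0x3ffffu	/* assumed width of the stored scan time */

struct node_stats {
	unsigned long promote_candidates;	/* like the PGPROMOTE_CANDIDATE counter */
	unsigned long rl_start_ms;		/* like pgdat->nbp_rl_start */
	unsigned long rl_nr_cand;		/* like pgdat->nbp_rl_nr_cand */
};

/* latency = fault time - scan time, computed modulo the stored width */
static unsigned int hint_fault_latency(unsigned int scan_time_ms, unsigned int now_ms)
{
	return (now_ms - scan_time_ms) & ACCESS_TIME_MASK;
}

/* true when this 1s window's candidates already exceed the budget */
static bool promotion_rate_limited(struct node_stats *ns, unsigned long rate_limit,
				   unsigned long now_ms, int nr_pages)
{
	ns->promote_candidates += nr_pages;
	if (now_ms - ns->rl_start_ms > 1000) {		/* open a new window */
		ns->rl_start_ms = now_ms;
		ns->rl_nr_cand = ns->promote_candidates;
	}
	return ns->promote_candidates - ns->rl_nr_cand >= rate_limit;
}

int main(void)
{
	struct node_stats ns = { .rl_start_ms = 0 };
	unsigned int lat = hint_fault_latency(100, 1300);

	printf("latency=%ums limited=%d\n", lat,
	       promotion_rate_limited(&ns, 16384, 1300, 512));
	return 0;
}

Built as an ordinary program this prints latency=1200ms limited=0 for the sample inputs; the real code additionally serializes the window update with cmpxchg().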
+ */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(page); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; + /* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for @@ -1592,11 +1756,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -2600,7 +2764,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2685,6 +2849,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. 
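The rate limit is configured in MB/s but consumed in pages, hence the shift by (20 - PAGE_SHIFT) above. A quick standalone check of that conversion, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;		/* 4 KiB pages assumed */
	unsigned long mbps = 65536;			/* default sysctl value */
	unsigned long pages_per_sec = mbps << (20 - page_shift);

	/* 65536 MB/s * (1 MiB / 4 KiB) = 16777216 pages/s */
	printf("%lu pages/s\n", pages_per_sec);
	return 0;
}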
+ */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2765,6 +2938,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2802,7 +2976,7 @@ static void task_numa_work(struct callback_head *work) } next_scan = now + msecs_to_jiffies(p->numa_scan_period); - if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) return; /* @@ -2821,13 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3838,8 +4015,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {} * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * @@ -4003,6 +4179,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4032,6 +4209,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); @@ -4064,8 +4248,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. 
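The cmpxchg() -> try_cmpxchg() conversion in task_numa_work() above is behaviour-preserving: try_cmpxchg() returns a boolean and, on failure, updates the expected value in place, which is exactly the C11 compare-exchange contract. A minimal userspace equivalent (claim_next_scan() is an illustrative name, not a kernel helper):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong numa_next_scan;

/* returns true if we won the race to install the new scan time */
static bool claim_next_scan(unsigned long expected, unsigned long next)
{
	/* like try_cmpxchg(): on failure, 'expected' now holds the current value */
	return atomic_compare_exchange_strong(&numa_next_scan, &expected, next);
}

int main(void)
{
	atomic_store(&numa_next_scan, 100);
	printf("first claim: %d\n", claim_next_scan(100, 200));	/* 1: we installed 200 */
	printf("stale claim: %d\n", claim_next_scan(100, 300));	/* 0: someone else already did */
	return 0;
}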
*/ sync_entity_load_avg(se); @@ -4108,14 +4292,16 @@ static inline unsigned long task_util_est(struct task_struct *p) } #ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); + return clamp(task_util_est(p), uclamp_min, uclamp_max); } #else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { return task_util_est(p); } @@ -4254,15 +4440,144 @@ done: trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, - unsigned long capacity) +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) +{ + unsigned long capacity_orig, capacity_orig_thermal; + unsigned long capacity = capacity_of(cpu); + bool fits, uclamp_max_fits; + + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); + + if (!uclamp_is_used()) + return fits; + + /* + * We must use capacity_orig_of() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for thermal pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + * + * In case of capacity inversion we should honour the inverted capacity + * for both uclamp_min and uclamp_max all the time. + */ + capacity_orig = cpu_in_capacity_inversion(cpu); + if (capacity_orig) { + capacity_orig_thermal = capacity_orig; + } else { + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + } + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. + * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on cpu0 and should migrate + * to cpu1 + * * util = 80% of y then it is forced to fit on cpu1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. 
This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); + if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) + fits = fits && (uclamp_min <= capacity_orig_thermal); + + return fits; +} + +static inline int task_fits_cpu(struct task_struct *p, int cpu) { - return fits_capacity(uclamp_task_util(p), capacity); + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return; if (!p || p->nr_cpus_allowed == 1) { @@ -4270,7 +4585,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -4292,6 +4607,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4434,7 +4750,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. 
* - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -4511,6 +4828,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int action = UPDATE_TG; + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + /* * Update run-time statistics of the 'current'. */ @@ -4519,12 +4841,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); update_stats_dequeue_fair(cfs_rq, se, flags); @@ -5682,7 +6005,10 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } static inline void update_overutilized_status(struct rq *rq) @@ -5893,8 +6219,8 @@ dequeue_throttle: #ifdef CONFIG_SMP /* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -6260,7 +6586,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -6268,7 +6594,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6284,7 +6610,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6310,9 +6636,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu bool idle = true; int cpu; - if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core, p); - for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { idle = false; @@ -6339,13 +6662,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. 
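Stripped of capacity inversion and the kernel's per-CPU accessors, the util_fits_cpu() rules above reduce to: the raw utilization must fit the pressured capacity; a capped task is also allowed to fit via uclamp_max against capacity_orig; and a boosted task must additionally get uclamp_min out of the thermally reduced capacity. A standalone sketch under those simplifying assumptions (thermal_pressure is a plain input here, and the 1280/1024 margin mirrors fits_capacity()):

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_SCALE 1024UL

/* like fits_capacity(): util must leave roughly 20% headroom */
static bool fits_capacity(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

static bool util_fits_cpu(unsigned long util, unsigned long umin, unsigned long umax,
			  unsigned long cap, unsigned long cap_orig,
			  unsigned long thermal_pressure)
{
	unsigned long cap_orig_thermal = cap_orig - thermal_pressure;
	bool fits = fits_capacity(util, cap);
	bool uclamp_max_fits;

	/* a capped task fits if uclamp_max fits capacity_orig (except at 1024) */
	uclamp_max_fits = (cap_orig == CAPACITY_SCALE) && (umax == CAPACITY_SCALE);
	uclamp_max_fits = !uclamp_max_fits && (umax <= cap_orig);
	fits = fits || uclamp_max_fits;

	/* a boosted task must also fit its uclamp_min, thermal pressure included */
	umin = umin < umax ? umin : umax;
	if (util < umin && cap_orig != CAPACITY_SCALE)
		fits = fits && (umin <= cap_orig_thermal);

	return fits;
}

int main(void)
{
	/* little CPU: orig capacity 512, some RT pressure, no thermal pressure */
	printf("capped task:  %d\n", util_fits_cpu(600, 0, 512, 480, 512, 0));		/* 1 */
	printf("boosted task: %d\n", util_fits_cpu(100, 700, 1024, 480, 512, 0));	/* 0 */
	return 0;
}

The two sample calls reproduce regions (a) and (c) from the comment: the capped task is forced to fit the little CPU, the boosted one is pushed away from it.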
*/ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6360,9 +6682,9 @@ static inline void set_idle_cores(int cpu, int val) { } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { - return def; + return false; } static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) @@ -6370,7 +6692,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6389,19 +6711,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); - struct sched_domain *this_sd; + struct sched_domain *this_sd = NULL; u64 time = 0; - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + /* * If we're busy, the assumption that the last idle period * predicts the future is flawed; age away the remaining @@ -6455,7 +6777,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && !has_idle_core) { + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { time = cpu_clock(this) - time; /* @@ -6478,21 +6800,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long task_util, best_cap = 0; + unsigned long task_util, util_min, util_max, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (fits_capacity(task_util, cpu_cap)) + if (util_fits_cpu(task_util, util_min, util_max, cpu)) return cpu; if (cpu_cap > best_cap) { @@ -6504,10 +6828,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } -static inline bool asym_fits_capacity(unsigned long task_util, int cpu) +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) { - if (static_branch_unlikely(&sched_asym_cpucapacity)) - return fits_capacity(task_util, capacity_of(cpu)); + if (sched_asym_cpucap_active()) + return util_fits_cpu(util, util_min, util_max, cpu); return true; } @@ 
-6519,16 +6846,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { bool has_idle_core = false; struct sched_domain *sd; - unsigned long task_util; + unsigned long task_util, util_min, util_max; int i, recent_used_cpu; /* * On asymmetric system, update task utilization because we will check * that the task fits with cpu's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); } /* @@ -6537,7 +6866,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* @@ -6545,7 +6874,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + asym_fits_cpu(task_util, util_min, util_max, prev)) return prev; /* @@ -6560,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) in_task() && prev == smp_processor_id() && this_rq()->nr_running <= 1 && - asym_fits_capacity(task_util, prev)) { + asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev; } @@ -6572,7 +6901,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && - asym_fits_capacity(task_util, recent_used_cpu)) { + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } @@ -6580,7 +6909,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -6601,10 +6930,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; if (sched_smt_active()) { - has_idle_core = test_idle_cores(target, false); + has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); + i = select_idle_smt(p, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } @@ -6868,6 +7197,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; struct sched_domain *sd; @@ -6892,7 +7223,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!uclamp_task_util(p, p_util_min, p_util_max)) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -6900,7 +7231,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap = 0; - bool compute_prev_delta = false; + unsigned long rq_util_min, rq_util_max; + unsigned long util_min, util_max; + unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; @@ -6936,26 +7269,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * much capacity we can get out of the CPU; this is * aligned with sched_cpu_util(). */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (!fits_capacity(util, cpu_cap)) + if (uclamp_is_used()) { + if (uclamp_rq_is_idle(cpu_rq(cpu))) { + util_min = p_util_min; + util_max = p_util_max; + } else { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. Ie: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } + } + if (!util_fits_cpu(util, util_min, util_max, cpu)) continue; lsub_positive(&cpu_cap, util); if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ - compute_prev_delta = true; + prev_spare_cap = cpu_cap; } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity - * in the performance domain. + * among the remaining CPUs in the performance + * domain. */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } } - if (max_spare_cap_cpu < 0 && !compute_prev_delta) + if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -6963,7 +7315,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ - if (compute_prev_delta) { + if (prev_spare_cap > 0) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ @@ -6974,7 +7326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using max_spare_cap_cpu. 
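The open-coded aggregation above amounts to taking the max of the runqueue's and the waking task's clamps, except on an idle runqueue where the task's own clamps apply unmodified, and deliberately not clamping the utilization itself since util_fits_cpu() wants the raw value. A compact model of just that decision (struct clamps and the rq_is_idle flag are illustrative stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct clamps { unsigned long min, max; };

static struct clamps effective_clamps(struct clamps rq, struct clamps task, bool rq_is_idle)
{
	struct clamps eff;

	if (rq_is_idle)
		return task;		/* idle rq: the task's clamps apply as-is */

	/* max aggregation only; the utilization itself stays unclamped */
	eff.min = rq.min > task.min ? rq.min : task.min;
	eff.max = rq.max > task.max ? rq.max : task.max;
	return eff;
}

int main(void)
{
	struct clamps rq = { .min = 0, .max = 512 };
	struct clamps task = { .min = 200, .max = 1024 };
	struct clamps eff = effective_clamps(rq, task, false);

	printf("min=%lu max=%lu\n", eff.min, eff.max);	/* min=200 max=1024 */
	return 0;
}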
*/ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ @@ -7076,8 +7428,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -7099,15 +7449,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(se); - - } else { + if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); /* @@ -7279,7 +7621,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - BUG_ON(!pse); + WARN_ON_ONCE(!pse); cse_is_idle = se_is_idle(se); pse_is_idle = se_is_idle(pse); @@ -7938,7 +8280,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -8012,8 +8354,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -8049,20 +8389,24 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) + /* + * We've more or less seen every task there is, call it quits + * unless we haven't found any movable task yet. 
+ */ + if (env->loop > env->loop_max && + !(env->flags & LBF_ALL_PINNED)) break; /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -8108,7 +8452,7 @@ static int detach_tasks(struct lb_env *env) case migrate_misfit: /* This is not a misfit task */ - if (task_fits_capacity(p, capacity_of(env->src_cpu))) + if (task_fits_cpu(p, env->src_cpu)) goto next; env->imbalance = 0; @@ -8159,7 +8503,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); check_preempt_curr(rq, p, 0); } @@ -8497,16 +8841,73 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { + unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; + struct rq *rq = cpu_rq(cpu); - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); + rq->cpu_capacity_orig = capacity_orig; if (!capacity) capacity = 1; - cpu_rq(cpu)->cpu_capacity = capacity; - trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + rq->cpu_capacity = capacity; + + /* + * Detect if the performance domain is in capacity inversion state. + * + * Capacity inversion happens when another perf domain with equal or + * lower capacity_orig_of() ends up having higher capacity than this + * domain after subtracting thermal pressure. + * + * We only take into account thermal pressure in this detection as it's + * the only metric that actually results in *real* reduction of + * capacity due to performance points (OPPs) being dropped/become + * unreachable due to thermal throttling. + * + * We assume: + * * That all cpus in a perf domain have the same capacity_orig + * (same uArch). + * * Thermal pressure will impact all cpus in this perf domain + * equally. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); + struct perf_domain *pd = rcu_dereference(rq->rd->pd); + + rq->cpu_capacity_inverted = 0; + + for (; pd; pd = pd->next) { + struct cpumask *pd_span = perf_domain_span(pd); + unsigned long pd_cap_orig, pd_cap; + + cpu = cpumask_any(pd_span); + pd_cap_orig = arch_scale_cpu_capacity(cpu); + + if (capacity_orig < pd_cap_orig) + continue; + + /* + * handle the case of multiple perf domains have the + * same capacity_orig but one of them is under higher + * thermal pressure. We record it as capacity + * inversion. 
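Capacity inversion, as described above, is the situation where thermal pressure pushes a CPU's effective capacity below what another performance domain of equal or lower capacity_orig can still deliver. A toy model over an array of per-domain capacities (struct perf_dom and the thermal figures are made-up inputs, not kernel state):

#include <stdio.h>

struct perf_dom { unsigned long cap_orig, thermal; };

/*
 * Returns this domain's thermally reduced capacity if it drops below what
 * another (equal or lower capacity_orig) domain can still deliver, else 0.
 */
static unsigned long capacity_inverted(const struct perf_dom *self,
				       const struct perf_dom *doms, int nr)
{
	unsigned long inv_cap = self->cap_orig - self->thermal;

	for (int i = 0; i < nr; i++) {
		const struct perf_dom *pd = &doms[i];

		if (pd == self || self->cap_orig < pd->cap_orig)
			continue;

		if (pd->cap_orig == self->cap_orig) {
			/* same uArch: inverted only if the peer is less throttled */
			if (pd->cap_orig - pd->thermal > inv_cap)
				return inv_cap;
		} else if (pd->cap_orig > inv_cap) {
			/* a nominally smaller domain now outperforms us */
			return inv_cap;
		}
	}
	return 0;
}

int main(void)
{
	struct perf_dom doms[] = {
		{ .cap_orig = 1024, .thermal = 600 },	/* big, heavily throttled */
		{ .cap_orig = 512,  .thermal = 0 },	/* little, cool */
	};

	printf("big inverted cap:    %lu\n", capacity_inverted(&doms[0], doms, 2));	/* 424 */
	printf("little inverted cap: %lu\n", capacity_inverted(&doms[1], doms, 2));	/* 0 */
	return 0;
}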
+ */ + if (capacity_orig == pd_cap_orig) { + pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); + + if (pd_cap > inv_cap) { + rq->cpu_capacity_inverted = inv_cap; + break; + } + } else if (pd_cap_orig > inv_cap) { + rq->cpu_capacity_inverted = inv_cap; + break; + } + } + } + + trace_sched_cpu_capacity_tp(rq); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -9113,6 +9514,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, memset(sgs, 0, sizeof(*sgs)); + /* Assume that task can't fit any CPU of the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY) + sgs->group_misfit_task_load = 1; + for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -9132,12 +9537,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } + /* Check if task fits in the CPU */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load && + task_fits_cpu(p, i)) + sgs->group_misfit_task_load = 0; - /* Check if task fits in the group */ - if (sd->flags & SD_ASYM_CPUCAPACITY && - !task_fits_capacity(p, group->sgc->max_capacity)) { - sgs->group_misfit_task_load = 1; } sgs->group_capacity = group->sgc->capacity; @@ -10099,14 +10504,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_span(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -10134,7 +10538,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); schedstat_add(sd->lb_imbalance[idle], env.imbalance); @@ -10182,7 +10586,9 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - goto more_balance; + /* Stop if we tried all running tasks */ + if (env.loop < busiest->nr_running) + goto more_balance; } /* @@ -10213,7 +10619,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -10245,7 +10651,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -10430,7 +10836,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -10916,8 +11322,7 @@ static bool update_nohz_stats(struct rq *rq) * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. 
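The reworked update_sg_wakeup_stats() also flips the misfit bookkeeping: on asymmetric-capacity domains the group now starts out presumed misfit and the flag is cleared by the first CPU that can take the task, instead of being derived afterwards from the group's max_capacity. A toy version of just that control flow, where task_fits() stands in for task_fits_cpu():

#include <stdbool.h>
#include <stdio.h>

/* stand-in for task_fits_cpu(); CPU 3 is the only one big enough here */
static bool task_fits(int cpu) { return cpu == 3; }

static unsigned long group_misfit(const int *cpus, int nr, bool asym_capacity)
{
	/* start from "misfit" and clear it on the first CPU that fits */
	unsigned long misfit = asym_capacity ? 1 : 0;

	for (int i = 0; i < nr; i++) {
		if (asym_capacity && misfit && task_fits(cpus[i]))
			misfit = 0;
	}
	return misfit;
}

int main(void)
{
	int little[] = { 0, 1 }, big[] = { 2, 3 };

	printf("little group misfit: %lu\n", group_misfit(little, 2, true));	/* 1 */
	printf("big group misfit:    %lu\n", group_misfit(big, 2, true));	/* 0 */
	return 0;
}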
*/ -static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -11032,7 +11437,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } @@ -11052,7 +11457,7 @@ void nohz_run_idle_balance(int cpu) * (ie NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) - _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); } static void nohz_newidle_balance(struct rq *this_rq) @@ -11552,6 +11957,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); @@ -11563,14 +11979,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -11666,39 +12074,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). 
+ */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; @@ -12075,6 +12469,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP + int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + } + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0..8ac8b81bfee6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -188,6 +189,7 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); group->poll_min_period = U32_MAX; @@ -201,6 +203,7 @@ void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -211,7 +214,7 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) { switch (state) { case PSI_IO_SOME: @@ -224,9 +227,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return unlikely(tasks[NR_MEMSTALL] && tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] > oncpu); case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -240,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + int current_cpu = raw_smp_processor_id(); + unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; enum psi_states s; unsigned int seq; @@ -254,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu, memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; + if (cpu == current_cpu) + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); } while (read_seqcount_retry(&groupc->seq, seq)); /* Calculate state time deltas against the previous snapshot */ @@ -278,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu, if (delta) *pchanged_states |= (1 << s); } + + /* + * When 
collect_percpu_times() from the avgs_work, we don't want to + * re-arm avgs_work when all CPUs are IDLE. But the current CPU running + * this avgs_work is never IDLE, cause avgs_work can't be shut off. + * So for the current CPU, we need to re-arm avgs_work only when + * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs + * we can just check PSI_NONIDLE delta. + */ + if (current_work() == &group->avgs_work.work) { + bool reschedule; + + if (cpu == current_cpu) + reschedule = tasks[NR_RUNNING] + + tasks[NR_IOWAIT] + + tasks[NR_MEMSTALL] > 1; + else + reschedule = *pchanged_states & (1 << PSI_NONIDLE); + + if (reschedule) + *pchanged_states |= PSI_STATE_RESCHEDULE; + } } static void calc_avgs(unsigned long avg[3], int missed_periods, @@ -413,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool nonidle; u64 now; dwork = to_delayed_work(work); @@ -424,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); collect_percpu_times(group, PSI_AVGS, &changed_states); - nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -435,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work) if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); - if (nonidle) { + if (changed_states & PSI_STATE_RESCHEDULE) { schedule_delayed_work(dwork, nsecs_to_jiffies( group->avg_next_update - now) + 1); } @@ -537,10 +564,12 @@ static u64 update_triggers(struct psi_group *group, u64 now) /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; + if (!t->pending_event) { + if (growth < t->threshold) + continue; - t->pending_event = true; + t->pending_event = true; + } } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) @@ -561,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* Schedule polling if it's not already scheduled. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +/* Schedule polling if it's not already scheduled or forced. */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, + bool force) { struct task_struct *task; /* - * Do not reschedule if already scheduled. - * Possible race with a timer scheduled after this check but before - * mod_timer below can be tolerated because group->polling_next_update - * will keep updates on schedule. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_poll_work). */ - if (timer_pending(&group->poll_timer)) + if (atomic_xchg(&group->poll_scheduled, 1) && !force) return; rcu_read_lock(); @@ -584,12 +612,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) */ if (likely(task)) mod_timer(&group->poll_timer, jiffies + delay); + else + atomic_set(&group->poll_scheduled, 0); rcu_read_unlock(); } static void psi_poll_work(struct psi_group *group) { + bool force_reschedule = false; u32 changed_states; u64 now; @@ -597,6 +628,43 @@ static void psi_poll_work(struct psi_group *group) now = sched_clock(); + if (now > group->polling_until) { + /* + * We are either about to start or might stop polling if no + * state change was recorded. 
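The poll_scheduled handling above is a schedule-once pattern: an unconditional exchange both claims the slot and provides the full barrier that the comment in psi_poll_work() relies on, while the force flag lets a still-open polling window re-arm the timer regardless. A userspace sketch with C11 atomics (schedule_poll() is an illustrative name, not the kernel function):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_scheduled;

/* returns true if this caller actually needs to arm the timer */
static bool schedule_poll(bool force)
{
	/* the exchange doubles as the full memory barrier discussed above */
	if (atomic_exchange(&poll_scheduled, 1) && !force)
		return false;		/* already scheduled; nothing to do */
	return true;
}

int main(void)
{
	printf("first:  %d\n", schedule_poll(false));	/* 1: we arm the timer */
	printf("second: %d\n", schedule_poll(false));	/* 0: already pending */
	printf("forced: %d\n", schedule_poll(true));	/* 1: re-arm regardless */
	atomic_store(&poll_scheduled, 0);		/* as the worker does when done */
	return 0;
}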
Resetting poll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate poll_work before we get to rescheduling. One + * potential extra wakeup at the end of the polling window + * should be negligible and polling_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->poll_scheduled, 0); + /* + * A task change can race with the poll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * poll_scheduled and the task state accesses, such that if the poll + * worker misses the state update, the task change is guaranteed to + * reschedule the poll worker: + * + * poll worker: + * atomic_set(poll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(poll_scheduled, 1) == 0: + * schedule poll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* Polling window is not over, keep rescheduling */ + force_reschedule = true; + } + + collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->poll_states) { @@ -622,7 +690,8 @@ static void psi_poll_work(struct psi_group *group) group->polling_next_update = update_triggers(group, now); psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + nsecs_to_jiffies(group->polling_next_update - now) + 1, + force_reschedule); out: mutex_unlock(&group->trigger_lock); @@ -688,35 +757,53 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; enum psi_states s; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, now); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -725,9 +812,25 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } @@ -739,41 +842,28 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. */ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + psi_schedule_poll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -796,8 +886,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; u64 now; if (!task->pid) @@ -806,19 +894,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. 
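With NR_ONCPU gone from the task counts, CPU pressure is now derived from the PSI_ONCPU bit carried in the state mask, as in the reworked test_state(): SOME means more runnable tasks than the one (possibly) on the CPU, FULL means runnable tasks but none of this group's tasks on the CPU. A standalone restatement of those two predicates:

#include <stdbool.h>
#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING, NR_COUNTS };

/* CPU pressure keys off the ONCPU flag rather than a task counter */
static bool cpu_some(const unsigned int t[NR_COUNTS], bool oncpu)
{
	return t[NR_RUNNING] > oncpu;		/* runnable but not running */
}

static bool cpu_full(const unsigned int t[NR_COUNTS], bool oncpu)
{
	return t[NR_RUNNING] && !oncpu;		/* runnable, none of ours on CPU */
}

int main(void)
{
	unsigned int tasks[NR_COUNTS] = { [NR_RUNNING] = 2 };

	printf("some=%d full=%d\n", cpu_some(tasks, true), cpu_full(tasks, true));	/* 1 0 */
	printf("some=%d full=%d\n", cpu_some(tasks, false), cpu_full(tasks, false));	/* 1 1 */
	return 0;
}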
- */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + group = task_psi_group(task); + do { + psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -826,34 +906,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; u64 now = cpu_clock(cpu); if (next->pid) { - bool identical_state; - psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - identical_state = prev->psi_flags == next->psi_flags; - iter = NULL; - while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + group = task_psi_group(next); + do { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } + } while ((group = group->parent)); } if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -867,26 +943,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + group = task_psi_group(prev); + do { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. 
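The TSK_ONCPU handoff above sets the bit up the incoming task's hierarchy until it meets a group where the outgoing task already left it set (the common ancestor), and clears the outgoing task's bit only below that point. A toy model with explicit parent pointers (the group layout and names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct group { const char *name; struct group *parent; bool oncpu; };

/*
 * Set ONCPU up @g's chain, stopping at the first group that already has it:
 * that group and everything above is shared with the task going off-CPU.
 */
static struct group *set_oncpu(struct group *g)
{
	for (; g; g = g->parent) {
		if (g->oncpu)
			return g;	/* common ancestor: already covered */
		g->oncpu = true;
	}
	return NULL;
}

/* Clear the outgoing task's ONCPU only below the common ancestor. */
static void clear_oncpu(struct group *g, struct group *common)
{
	for (; g && g != common; g = g->parent)
		g->oncpu = false;
}

int main(void)
{
	struct group root = { "root", NULL, false };
	struct group a = { "a", &root, false };
	struct group a1 = { "a/1", &a, false }, a2 = { "a/2", &a, false };

	set_oncpu(&a1);				/* prev was running in a/1 */
	struct group *common = set_oncpu(&a2);	/* next runs in a/2 */
	clear_oncpu(&a1, common);

	printf("common=%s a1=%d a2=%d a=%d\n",
	       common->name, a1.oncpu, a2.oncpu, a.oncpu);	/* common=a a1=0 a2=1 a=1 */
	return 0;
}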
*/ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + for (; group; group = group->parent) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + group = task_psi_group(task); + do { + if (!group->enabled) + continue; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1, false); + } while ((group = group->parent)); +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -917,6 +1041,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -946,11 +1071,12 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -963,12 +1089,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; cancel_delayed_work_sync(&cgroup->psi->avgs_work); @@ -996,7 +1123,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -1044,10 +1171,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. 
+ */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1062,7 +1224,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1076,7 +1242,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1104,6 +1270,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1221,6 +1392,7 @@ void psi_trigger_destroy(struct psi_trigger *t) * can no longer be found through group->poll_task. */ kthread_stop(task_to_destroy); + atomic_set(&group->poll_scheduled, 0); } kfree(t); } @@ -1388,6 +1560,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1395,6 +1594,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 55f39c8f4203..ed2a47e4ddae 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -410,8 +410,8 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static DEFINE_PER_CPU(struct callback_head, rt_push_head); -static DEFINE_PER_CPU(struct callback_head, rt_pull_head); +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); @@ -509,7 +509,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 
unsigned int cpu_cap; /* Only heterogeneous systems can benefit from this check */ - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return true; min_cap = uclamp_eff_value(p, UCLAMP_MIN); @@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. */ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -1062,11 +1062,7 @@ static void update_curr_rt(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (!rt_bandwidth_enabled()) return; @@ -1849,7 +1845,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; @@ -1897,7 +1893,7 @@ static int find_lowest_rq(struct task_struct *task) * If we're on asym system ensure we consider the different capacities * of the CPUs when searching for the lowest_mask. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, @@ -2004,7 +2000,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -2462,7 +2458,7 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - bool need_to_push = !task_running(rq, p) && + bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e26688d387ae..771f8ddb7053 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,21 +321,6 @@ struct dl_bw { u64 total_bw; }; -/* - * Verify the fitness of task @p to run on @cpu taking into account the - * CPU original capacity and the runtime/deadline ratio of the task. - * - * The function will return true if the CPU original capacity of the - * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the - * task and false otherwise. - */ -static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) -{ - unsigned long cap = arch_scale_cpu_capacity(cpu); - - return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; -} - extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -953,6 +938,12 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ +struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + /* * This is the main, per-CPU runqueue data structure. 
* @@ -1050,8 +1041,9 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; + unsigned long cpu_capacity_inverted; - struct callback_head *balance_callback; + struct balance_callback *balance_callback; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1159,6 +1151,9 @@ struct rq { unsigned int core_forceidle_occupation; u64 core_forceidle_start; #endif + + /* Scratch cpumask to be temporarily used under rq_lock */ + cpumask_var_t scratch_mask; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1197,6 +1192,14 @@ static inline bool is_migration_disabled(struct task_struct *p) #endif } +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(&runqueues) + struct sched_group; #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1284,7 +1287,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { - if (sched_core_cookie_match(rq, p)) + if (sched_core_cookie_match(cpu_rq(cpu), p)) return true; } return false; @@ -1399,14 +1402,6 @@ static inline void update_idle_core(struct rq *rq) static inline void update_idle_core(struct rq *rq) { } #endif -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() raw_cpu_ptr(&runqueues) - #ifdef CONFIG_FAIR_GROUP_SCHED static inline struct task_struct *task_of(struct sched_entity *se) { @@ -1559,7 +1554,7 @@ struct rq_flags { #endif }; -extern struct callback_head balance_push_callback; +extern struct balance_callback balance_push_callback; /* * Lockdep annotation that avoids accidental unlocks; it's like a @@ -1739,7 +1734,7 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static inline void queue_balance_callback(struct rq *rq, - struct callback_head *head, + struct balance_callback *head, void (*func)(struct rq *rq)) { lockdep_assert_rq_held(rq); @@ -1752,7 +1747,7 @@ queue_balance_callback(struct rq *rq, if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; - head->func = (void (*)(struct callback_head *))func; + head->func = func; head->next = rq->balance_callback; rq->balance_callback = head; } @@ -1815,6 +1810,11 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} + struct sched_group_capacity { atomic_t ref; /* @@ -1881,6 +1881,13 @@ static inline void dirty_sched_domain_sysctl(int cpu) #endif extern int sched_update_scaling(void); + +static inline const struct cpumask *task_user_cpus(struct task_struct *p) +{ + if (!p->user_cpus_ptr) + return cpu_possible_mask; /* &init_task.cpus_mask */ + return p->user_cpus_ptr; +} #endif /* CONFIG_SMP */ #include "stats.h" @@ -1942,6 +1949,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? 
tg->se[cpu]->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -2060,7 +2068,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -static inline int task_running(struct rq *rq, struct task_struct *p) +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->on_cpu; @@ -2147,6 +2155,12 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +struct affinity_context { + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; +}; + struct sched_class { #ifdef CONFIG_UCLAMP_TASK @@ -2175,9 +2189,7 @@ struct sched_class { void (*task_woken)(struct rq *this_rq, struct task_struct *task); - void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask, - u32 flags); + void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@ -2204,11 +2216,8 @@ struct sched_class { void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 - #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p); #endif }; @@ -2291,7 +2300,7 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); +extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); static inline struct task_struct *get_push_task(struct rq *rq) { @@ -2435,6 +2444,12 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +#ifdef CONFIG_PREEMPT_RT +#define SCHED_NR_MIGRATE_BREAK 8 +#else +#define SCHED_NR_MIGRATE_BREAK 32 +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2452,6 +2467,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; #endif #ifdef CONFIG_SCHED_HRTICK @@ -2709,8 +2725,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ double_rq_clock_clear_update(rq1, rq2); @@ -2726,7 +2742,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_unlock(rq1); __release(rq2->lock); } @@ -2877,6 +2893,24 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +/* + * Returns inverted capacity if the CPU is in capacity inversion state. + * 0 otherwise. + * + * Capacity inversion detection only considers thermal impact where actual + * performance points (OPPs) gets dropped. 
+ * + * Capacity inversion state happens when another performance domain that has + * equal or lower capacity_orig_of() becomes effectively larger than the perf + * domain this CPU belongs to due to thermal pressure throttling it hard. + * + * See comment in update_cpu_capacity(). + */ +static inline unsigned long cpu_in_capacity_inversion(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_inverted; +} + /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -2896,6 +2930,21 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, enum cpu_util_type type, struct task_struct *p); +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2963,6 +3012,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq) #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +static inline unsigned long uclamp_rq_get(struct rq *rq, + enum uclamp_id clamp_id) +{ + return READ_ONCE(rq->uclamp[clamp_id].value); +} + +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, + unsigned int value) +{ + WRITE_ONCE(rq->uclamp[clamp_id].value, value); +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return rq->uclamp_flags & UCLAMP_FLAG_IDLE; +} + /** * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. * @rq: The rq to clamp against. Must not be NULL. @@ -2998,12 +3064,12 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, * Ignore last runnable task's max clamp, as this task will * reset it. Similarly, no need to read the rq's min clamp. 
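
The relocated dl_task_fits_capacity() above now checks against the task's precomputed deadline density instead of scaling the deadline on every call. Assuming dl_density is the runtime/deadline ratio in BW_SHIFT fixed point (set up when the deadline parameters are installed, which is not shown in this diff), the new test reduces to the old one up to rounding:

    cap >= dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT)
        ~ cap >= ((dl_runtime << BW_SHIFT) / dl_deadline) >> (BW_SHIFT - SCHED_CAPACITY_SHIFT)
        ~ cap >= (dl_runtime << SCHED_CAPACITY_SHIFT) / dl_deadline
        ~ (dl_deadline * cap) >> SCHED_CAPACITY_SHIFT >= dl_runtime

which is the former cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime check, with the division moved out of the hot path.
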
*/ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + if (uclamp_rq_is_idle(rq)) goto out; } - min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); - max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); + min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)); + max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)); out: /* * Since CPU's {min,max}_util clamps are MAX aggregated considering @@ -3044,6 +3110,15 @@ static inline bool uclamp_is_used(void) return static_branch_likely(&sched_uclamp_used); } #else /* CONFIG_UCLAMP_TASK */ +static inline unsigned long uclamp_eff_value(struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) @@ -3057,6 +3132,25 @@ static inline bool uclamp_is_used(void) { return false; } + +static inline unsigned long uclamp_rq_get(struct rq *rq, + enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, + unsigned int value) +{ +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ @@ -3157,4 +3251,14 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +static inline void update_current_exec_runtime(struct task_struct *curr, + u64 now, u64 delta_exec) +{ + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = now; + cgroup_account_cputime(curr, delta_exec); +} + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baa839c1ba96..38f3698f5e5b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se) } #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta); + /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -123,11 +128,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (p->in_memstall) set |= TSK_MEMSTALL_RUNNING; - if (!wakeup || p->sched_psi_wake_requeue) { + if (!wakeup) { if (p->in_memstall) set |= TSK_MEMSTALL; - if (p->sched_psi_wake_requeue) - p->sched_psi_wake_requeue = 0; } else { if (p->in_iowait) clear |= TSK_IOWAIT; @@ -138,8 +141,6 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING; - if (static_branch_likely(&psi_disabled)) return; @@ -152,10 +153,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) if (sleep) return; - if (p->in_memstall) - clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); - - psi_task_change(p, clear, 0); + psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -167,19 +165,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. 
*/ - if (unlikely(p->in_iowait || p->in_memstall)) { + if (unlikely(p->psi_flags)) { struct rq_flags rf; struct rq *rq; - int clear = 0; - - if (p->in_iowait) - clear |= TSK_IOWAIT; - if (p->in_memstall) - clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); - psi_task_change(p, clear, 0); - p->sched_psi_wake_requeue = 1; + psi_task_change(p, p->psi_flags, 0); __task_rq_unlock(rq, &rf); } } @@ -201,6 +192,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d04073a93eb4..85590599b4d6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,20 +71,17 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; - u64 delta_exec; + u64 now, delta_exec; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq_clock_task(rq); - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 9860bb9a847c..133b74730738 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -121,11 +121,12 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, return nr_exclusive; } -static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, +static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { unsigned long flags; wait_queue_entry_t bookmark; + int remaining = nr_exclusive; bookmark.flags = 0; bookmark.private = NULL; @@ -134,10 +135,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int do { spin_lock_irqsave(&wq_head->lock, flags); - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + remaining = __wake_up_common(wq_head, mode, remaining, wake_flags, key, &bookmark); spin_unlock_irqrestore(&wq_head->lock, flags); } while (bookmark.flags & WQ_FLAG_BOOKMARK); + + return nr_exclusive - remaining; } /** @@ -147,13 +150,14 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * - * If this function wakes up a task, it executes a full memory barrier before - * accessing the task state. + * If this function wakes up a task, it executes a full memory barrier + * before accessing the task state. Returns the number of exclusive + * tasks that were awaken. 
*/ -void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) +int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, void *key) { - __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); + return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); diff --git a/kernel/scs.c b/kernel/scs.c index b7e1b096d906..d7809affe740 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -12,6 +12,10 @@ #include <linux/vmalloc.h> #include <linux/vmstat.h> +#ifdef CONFIG_DYNAMIC_SCS +DEFINE_STATIC_KEY_FALSE(dynamic_scs_enabled); +#endif + static void __scs_account(void *s, int account) { struct page *scs_page = vmalloc_to_page(s); @@ -101,14 +105,20 @@ static int scs_cleanup(unsigned int cpu) void __init scs_init(void) { + if (!scs_is_enabled()) + return; cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL, scs_cleanup); } int scs_prepare(struct task_struct *tsk, int node) { - void *s = scs_alloc(node); + void *s; + if (!scs_is_enabled()) + return 0; + + s = scs_alloc(node); if (!s) return -ENOMEM; @@ -148,7 +158,7 @@ void scs_release(struct task_struct *tsk) { void *s = task_scs(tsk); - if (!s) + if (!scs_is_enabled() || !s) return; WARN(task_scs_end_corrupted(tsk), diff --git a/kernel/signal.c b/kernel/signal.c index 6f86fda5e432..ae26da61c4d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -913,8 +913,9 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) if (signal->core_state) return sig == SIGKILL; /* - * The process is in the middle of dying, nothing to do. + * The process is in the middle of dying, drop the signal. */ + return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. @@ -1254,7 +1255,7 @@ int send_signal_locked(int sig, struct kernel_siginfo *info, static void print_fatal_signal(int signr) { - struct pt_regs *regs = signal_pt_regs(); + struct pt_regs *regs = task_pt_regs(current); pr_info("potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) @@ -2304,7 +2305,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, read_unlock(&tasklist_lock); cgroup_enter_frozen(); preempt_enable_no_resched(); - freezable_schedule(); + schedule(); cgroup_leave_frozen(true); /* @@ -2473,7 +2474,7 @@ static bool do_signal_stop(int signr) /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); - freezable_schedule(); + schedule(); return true; } else { /* @@ -2548,11 +2549,11 @@ static void do_freezer_trap(void) * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); - freezable_schedule(); + schedule(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) @@ -2692,6 +2693,7 @@ relock: /* Has this task already been marked for death? 
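
With the change above, __wake_up() now reports how many exclusive waiters it actually woke instead of returning void. A hypothetical caller (the helper name and wait queue are illustrative) could use that to detect an empty queue:

    #include <linux/wait.h>

    /* Wake at most one exclusive waiter; report whether anyone was there. */
    static bool kick_one_waiter(struct wait_queue_head *wq)
    {
        return __wake_up(wq, TASK_NORMAL, 1, NULL) > 0;
    }
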
*/ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { + clear_siginfo(&ksig->info); ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, @@ -3600,9 +3602,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - __set_current_state(TASK_INTERRUPTIBLE); - ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, - HRTIMER_MODE_REL); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); diff --git a/kernel/smp.c b/kernel/smp.c index 650810a6f29b..06a413987a14 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * if (cpu >= 0) { if (static_branch_unlikely(&csdlock_debug_extended)) csd_lock_print_extended(csd, cpu); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); @@ -1070,7 +1069,7 @@ static int __init nrcpus(char *str) int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) - nr_cpu_ids = nr_cpus; + set_nr_cpu_ids(nr_cpus); return 0; } @@ -1088,14 +1087,16 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); +#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); +#endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { - nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; + set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9f54544e749..2c7396da470c 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds) /* The outgoing CPU will normally get done quite quickly. */ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state; + goto update_state_early; udelay(5); /* But if the outgoing CPU dawdles, wait increasingly long times. */ @@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds) break; sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); } -update_state: +update_state_early: oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +update_state: if (oldstate == CPU_DEAD) { /* Outgoing CPU died normally, update state. */ smp_mb(); /* atomic_read() before update. */ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); } else { /* Outgoing CPU still hasn't died, set state accordingly. 
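
The signal.c hunks above (and the hrtimer do_nanosleep() hunk later in this diff) follow the freezer rework: the freezable_schedule*() wrappers are removed and callers instead fold TASK_FREEZABLE into the sleep state before calling plain schedule(). The conversion pattern, shown here in its generic form rather than as another hunk:

    /* Before: the wrapper marked the sleep as freezable around schedule(). */
    __set_current_state(TASK_INTERRUPTIBLE);
    freezable_schedule();

    /* After: freezability is carried in the task state itself. */
    __set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
    schedule();
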
*/ - if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, CPU_BROKEN) != oldstate) + if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, CPU_BROKEN)) goto update_state; ret = false; } @@ -475,14 +476,14 @@ bool cpu_report_death(void) int newstate; int cpu = smp_processor_id(); + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); do { - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); if (oldstate != CPU_BROKEN) newstate = CPU_DEAD; else newstate = CPU_DEAD_FROZEN; - } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, newstate) != oldstate); + } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, newstate)); return newstate == CPU_DEAD; } diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index dc5665b62814..639397b5491c 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,18 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static bool static_call_initialized; +static int static_call_initialized; + +/* + * Must be called before early_initcall() to be effective. + */ +void static_call_force_reinit(void) +{ + if (WARN_ON_ONCE(!static_call_initialized)) + return; + + static_call_initialized++; +} /* mutex to protect key modules/sites */ static DEFINE_MUTEX(static_call_mutex); @@ -475,7 +486,8 @@ int __init static_call_init(void) { int ret; - if (static_call_initialized) + /* See static_call_force_reinit(). */ + if (static_call_initialized == 1) return 0; cpus_read_lock(); @@ -490,11 +502,12 @@ int __init static_call_init(void) BUG(); } - static_call_initialized = true; - #ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); + if (!static_call_initialized) + register_module_notifier(&static_call_module_nb); #endif + + static_call_initialized = 1; return 0; } early_initcall(static_call_init); diff --git a/kernel/sys.c b/kernel/sys.c index b911fa6d81ab..5fd54bf0e886 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -25,6 +25,7 @@ #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> +#include <linux/random.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> @@ -496,7 +497,7 @@ static void flag_nproc_exceeded(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. 
*/ - if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && + if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else @@ -1366,6 +1367,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; + add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); @@ -1419,6 +1421,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; + add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 664ded05dd7a..6ef887c19c48 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -9,9 +9,6 @@ #define KUNIT_PROC_READ 0 #define KUNIT_PROC_WRITE 1 -static int i_zero; -static int i_one_hundred = 100; - /* * Test that proc_dointvec will not try to use a NULL .data field even when the * length is non-zero. @@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; /* * proc_dointvec expects a buffer in user space, so we allocate one. We @@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test) .maxlen = 0, .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t len = 4; loff_t pos = 0; @@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t len = 5; loff_t pos = 0; @@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; char input[] = "9"; size_t len = sizeof(input) - 1; @@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test) .maxlen 
= sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; char input[] = "-9"; size_t len = sizeof(input) - 1; @@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t max_len = 32, len = max_len; loff_t pos = 0; @@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t max_len = 32, len = max_len; loff_t pos = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..137d4abe3eda 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,9 +82,16 @@ #include <linux/rtmutex.h> #endif +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; +EXPORT_SYMBOL(sysctl_vals); + +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + #if defined(CONFIG_SYSCTL) -/* Constants used for minimum and maximum */ +/* Constants used for minimum and maximum */ #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; @@ -129,11 +136,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; int sysctl_legacy_va_layout; #endif -#ifdef CONFIG_COMPACTION -/* min_extfrag_threshold is SYSCTL_ZERO */; -static const int max_extfrag_threshold = 1000; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -265,13 +267,14 @@ int proc_dostring(struct ctl_table *table, int write, ppos); } -static size_t proc_skip_spaces(char **buf) +static void proc_skip_spaces(char **buf, size_t *size) { - size_t ret; - char *tmp = skip_spaces(*buf); - ret = tmp - *buf; - *buf = tmp; - return ret; + while (*size) { + if (!isspace(**buf)) + break; + (*size)--; + (*buf)++; + } } static void proc_skip_char(char **buf, size_t *size, const char v) @@ -340,13 +343,12 @@ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, const char *perm_tr, unsigned perm_tr_len, char *tr) { - int len; char *p, tmp[TMPBUFLEN]; + ssize_t len = *size; - if (!*size) + if (len <= 0) return -EINVAL; - len = *size; if (len > TMPBUFLEN - 1) len = TMPBUFLEN - 1; @@ -519,7 +521,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) break; @@ -546,7 +548,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (write && first) return err ? 
: -EINVAL; *lenp -= left; @@ -588,7 +590,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) { err = -EINVAL; goto out_free; @@ -608,7 +610,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, } if (!err && left) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); out_free: if (err) @@ -1052,9 +1054,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, return 0; } - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; + i = data; + min = table->extra1; + max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; @@ -1073,7 +1075,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, if (write) { bool neg; - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) break; @@ -1102,7 +1104,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; @@ -1631,17 +1633,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { -#ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing", - .data = NULL, /* filled in by handler */ - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_numa_balancing, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_FOUR, - }, -#endif /* CONFIG_NUMA_BALANCING */ { .procname = "panic", .data = &panic_timeout, @@ -2115,6 +2106,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, }, { .procname = "dirtytime_expire_seconds", @@ -2216,7 +2208,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&max_extfrag_threshold, + .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "compact_unevictable_allowed", diff --git a/kernel/task_work.c b/kernel/task_work.c index dff75bcde151..065e1ef8fc8d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work, /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); + head = READ_ONCE(task->task_works); do { - head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; - } while (cmpxchg(&task->task_works, head, work) != head); + } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: @@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task, * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = READ_ONCE(*pprev))) { - if (!match(work, data)) + work = READ_ONCE(*pprev); + while (work) { + if (!match(work, data)) { pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) + work = READ_ONCE(*pprev); + } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -151,16 +153,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. 
*/ + work = READ_ONCE(task->task_works); do { head = NULL; - work = READ_ONCE(task->task_works); if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } - } while (cmpxchg(&task->task_works, work, head) != work); + } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f7e246336218..8ce3fa0c19e2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .resv_start_op = CGROUPSTATS_CMD_GET + 1, .netnsok = true, }; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5d85014d59b5..960143b183cd 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -76,7 +76,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, } /** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cee5da1e54c4..9cf32ccda715 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = prandom_u32() % nr_cpu_ids; + cpu = get_random_u32_below(nr_cpu_ids); cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23af5eca11b1..3ae661ab6260 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod struct restart_block *restart; do { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) - freezable_schedule(); + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index aec832801c26..0775b9ec952a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -192,6 +192,24 @@ static void timens_setup_vdso_data(struct vdso_data *vdata, offset[CLOCK_BOOTTIME_ALARM] = boottime; } +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + /* * Protects possibly multiple offsets writers racing each other * and tasks entering the namespace. 
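
The task_work.c hunks above, like the smpboot.c ones earlier in this diff, convert open-coded cmpxchg() retry loops to try_cmpxchg(), which writes the currently observed value back into the expected-value variable on failure and so saves the explicit re-read. The generic shape of the conversion (ptr, old, new and compute() are placeholders, not code from the tree):

    /* Before: re-load 'old' on every failed attempt. */
    do {
        old = READ_ONCE(*ptr);
        new = compute(old);
    } while (cmpxchg(ptr, old, new) != old);

    /* After: a failed try_cmpxchg() updates 'old' with the current value. */
    old = READ_ONCE(*ptr);
    do {
        new = compute(old);
    } while (!try_cmpxchg(ptr, &old, new));
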
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 717fcb9fb14a..63a8ce7177dd 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1017,7 +1017,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option unsigned int idx = UINT_MAX; int ret = 0; - BUG_ON(!timer->function); + debug_assert_init(timer); /* * This is a common optimization triggered by the networking code - if @@ -1044,6 +1044,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated + * while holding base lock to prevent a race against the + * shutdown code. + */ + if (!timer->function) + goto out_unlock; + forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && @@ -1070,6 +1078,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } else { base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated + * while holding base lock to prevent a race against the + * shutdown code. + */ + if (!timer->function) + goto out_unlock; + forward_timer_base(base); } @@ -1083,7 +1099,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's + * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ @@ -1121,14 +1137,20 @@ out_unlock: } /** - * mod_timer_pending - modify a pending timer's timeout - * @timer: the pending timer to be modified - * @expires: new timeout in jiffies + * mod_timer_pending - Modify a pending timer's timeout + * @timer: The pending timer to be modified + * @expires: New absolute timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), but + * will not activate inactive timers. * - * mod_timer_pending() is the same for pending timers as mod_timer(), - * but will not re-activate and modify already deleted timers. + * If @timer->function == NULL then the start operation is silently + * discarded. * - * It is useful for unserialized use of timers. + * Return: + * * %0 - The timer was inactive and not modified or was in + * shutdown state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { @@ -1137,24 +1159,31 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(mod_timer_pending); /** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) + * mod_timer - Modify a timer's timeout + * @timer: The timer to be modified + * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * + * mod_timer() is more efficient than the above open coded sequence. In + * case that the timer is inactive, the del_timer() part is a NOP. The + * timer is in any case activated with the new expiry time @expires. 
+ * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) + * If @timer->function == NULL then the start operation is silently + * discarded. In this case the return value is 0 and meaningless. + * + * Return: + * * %0 - The timer was inactive and started or was in shutdown + * state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires did + * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { @@ -1165,11 +1194,22 @@ EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified - * @expires: New timeout in jiffies + * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only - * modify a running timer if that would reduce the expiration time (it will - * start a timer that isn't running). + * modify an enqueued timer if that would reduce the expiration time. If + * @timer is not enqueued it starts the timer. + * + * If @timer->function == NULL then the start operation is silently + * discarded. + * + * Return: + * * %0 - The timer was inactive and started or was in shutdown + * state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires + * did not change the effective expiry time such that the + * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { @@ -1178,39 +1218,51 @@ int timer_reduce(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(timer_reduce); /** - * add_timer - start a timer - * @timer: the timer to be added + * add_timer - Start a timer + * @timer: The timer to be started * - * The kernel will do a ->function(@timer) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. + * Start @timer to expire at @timer->expires in the future. @timer->expires + * is the absolute expiry time measured in 'jiffies'. When the timer expires + * timer->function(timer) will be invoked from soft interrupt context. * - * The timer's ->expires, ->function fields must be set prior calling this - * function. + * The @timer->expires and @timer->function fields must be set prior + * to calling this function. * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. + * If @timer->function == NULL then the start operation is silently + * discarded. + * + * If @timer->expires is already in the past @timer will be queued to + * expire at the next timer tick. + * + * This can only operate on an inactive timer. Attempts to invoke this on + * an active timer are rejected with a warning. 
*/ void add_timer(struct timer_list *timer) { - BUG_ON(timer_pending(timer)); + if (WARN_ON_ONCE(timer_pending(timer))) + return; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * add_timer_on - Start a timer on a particular CPU + * @timer: The timer to be started + * @cpu: The CPU to start it on + * + * Same as add_timer() except that it starts the timer on the given CPU. * - * This is not very scalable on SMP. Double adds are not possible. + * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; - BUG_ON(timer_pending(timer) || !timer->function); + debug_assert_init(timer); + + if (WARN_ON_ONCE(timer_pending(timer))) + return; new_base = get_timer_cpu_base(timer->flags, cpu); @@ -1220,6 +1272,13 @@ void add_timer_on(struct timer_list *timer, int cpu) * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated while + * holding base lock to prevent a race against the shutdown code. + */ + if (!timer->function) + goto out_unlock; + if (base != new_base) { timer->flags |= TIMER_MIGRATING; @@ -1233,22 +1292,27 @@ void add_timer_on(struct timer_list *timer, int cpu) debug_timer_activate(timer); internal_add_timer(base, timer); +out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** - * del_timer - deactivate a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) + * __timer_delete - Internal function: Deactivate a timer + * @timer: The timer to be deactivated + * @shutdown: If true, this indicates that the timer is about to be + * shutdown permanently. + * + * If @shutdown is true then @timer->function is set to NULL under the + * timer base lock which prevents further rearming of the time. In that + * case any attempt to rearm @timer after this function returns will be + * silently ignored. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ -int del_timer(struct timer_list *timer) +static int __timer_delete(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; @@ -1256,24 +1320,90 @@ int del_timer(struct timer_list *timer) debug_assert_init(timer); - if (timer_pending(timer)) { + /* + * If @shutdown is set then the lock has to be taken whether the + * timer is pending or not to protect against a concurrent rearm + * which might hit between the lockless pending check and the lock + * aquisition. By taking the lock it is ensured that such a newly + * enqueued timer is dequeued and cannot end up with + * timer->function == NULL in the expiry code. + * + * If timer->function is currently executed, then this makes sure + * that the callback cannot requeue the timer. 
+ */ + if (timer_pending(timer) || shutdown) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); + if (shutdown) + timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } -EXPORT_SYMBOL(del_timer); /** - * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer to delete + * timer_delete - Deactivate a timer + * @timer: The timer to be deactivated + * + * The function only deactivates a pending timer, but contrary to + * timer_delete_sync() it does not take into account whether the timer's + * callback function is concurrently executed on a different CPU or not. + * It neither prevents rearming of the timer. If @timer can be rearmed + * concurrently then the return value of this function is meaningless. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +int timer_delete(struct timer_list *timer) +{ + return __timer_delete(timer, false); +} +EXPORT_SYMBOL(timer_delete); + +/** + * timer_shutdown - Deactivate a timer and prevent rearming + * @timer: The timer to be deactivated * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. + * The function does not wait for an eventually running timer callback on a + * different CPU but it prevents rearming of the timer. Any attempt to arm + * @timer after this function returns will be silently ignored. + * + * This function is useful for teardown code and should only be used when + * timer_shutdown_sync() cannot be invoked due to locking or context constraints. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_shutdown(struct timer_list *timer) +{ + return __timer_delete(timer, true); +} +EXPORT_SYMBOL_GPL(timer_shutdown); + +/** + * __try_to_del_timer_sync - Internal function: Try to deactivate a timer + * @timer: Timer to deactivate + * @shutdown: If true, this indicates that the timer is about to be + * shutdown permanently. + * + * If @shutdown is true then @timer->function is set to NULL under the + * timer base lock which prevents further rearming of the timer. Any + * attempt to rearm @timer after this function returns will be silently + * ignored. + * + * This function cannot guarantee that the timer cannot be rearmed + * right after dropping the base lock if @shutdown is false. That + * needs to be prevented by the calling code if necessary. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + * * %-1 - The timer callback function is running on a different CPU + */ +static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; @@ -1285,11 +1415,34 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); + if (shutdown) + timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: Timer to deactivate + * + * This function tries to deactivate a timer. On success the timer is not + * queued and the timer callback function is not running on any CPU. + * + * This function does not guarantee that the timer cannot be rearmed right + * after dropping the base lock. That needs to be prevented by the calling + * code if necessary. 
+ * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + * * %-1 - The timer callback function is running on a different CPU + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + return __try_to_del_timer_sync(timer, false); +} EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_PREEMPT_RT @@ -1365,44 +1518,29 @@ static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: Callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts unless the timer is an irqsafe one. The caller must - * not hold locks which would prevent completion of the timer's - * handler. The timer's handler must not call add_timer_on(). Upon exit the - * timer is not queued and the handler is not running on any CPU. - * - * Note: For !irqsafe timers, you must not hold locks that are held in - * interrupt context while calling this function. Even if the lock has - * nothing to do with the timer in question. Here's why:: - * - * CPU0 CPU1 - * ---- ---- - * <SOFTIRQ> - * call_timer_fn(); - * base->running_timer = mytimer; - * spin_lock_irq(somelock); - * <IRQ> - * spin_lock(somelock); - * del_timer_sync(mytimer); - * while (base->running_timer == mytimer); - * - * Now del_timer_sync() will never return and never release somelock. - * The interrupt on the other CPU is waiting to grab somelock but - * it has interrupted the softirq that CPU0 is waiting to finish. - * - * The function returns whether it has deactivated a pending timer or not. + * __timer_delete_sync - Internal function: Deactivate a timer and wait + * for the handler to finish. + * @timer: The timer to be deactivated + * @shutdown: If true, @timer->function will be set to NULL under the + * timer base lock which prevents rearming of @timer + * + * If @shutdown is not set the timer can be rearmed later. If the timer can + * be rearmed concurrently, i.e. after dropping the base lock then the + * return value is meaningless. + * + * If @shutdown is set then @timer->function is set to NULL under timer + * base lock which prevents rearming of the timer. Any attempt to rearm + * a shutdown timer is silently ignored. + * + * If the timer should be reused after shutdown it has to be initialized + * again. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ -int del_timer_sync(struct timer_list *timer) +static int __timer_delete_sync(struct timer_list *timer, bool shutdown) { int ret; @@ -1422,7 +1560,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. 
*/ - WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); /* * Must be able to sleep on PREEMPT_RT because of the slowpath in @@ -1432,7 +1570,7 @@ int del_timer_sync(struct timer_list *timer) lockdep_assert_preemption_enabled(); do { - ret = try_to_del_timer_sync(timer); + ret = __try_to_del_timer_sync(timer, shutdown); if (unlikely(ret < 0)) { del_timer_wait_running(timer); @@ -1442,8 +1580,96 @@ int del_timer_sync(struct timer_list *timer) return ret; } -EXPORT_SYMBOL(del_timer_sync); -#endif + +/** + * timer_delete_sync - Deactivate a timer and wait for the handler to finish. + * @timer: The timer to be deactivated + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's callback + * function. The timer's handler must not call add_timer_on(). Upon exit + * the timer is not queued and the handler is not running on any CPU. + * + * For !irqsafe timers, the caller must not hold locks that are held in + * interrupt context. Even if the lock has nothing to do with the timer in + * question. Here's why:: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * timer_delete_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now timer_delete_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but it has + * interrupted the softirq that CPU0 is waiting to finish. + * + * This function cannot guarantee that the timer is not rearmed again by + * some concurrent or preempting code, right after it dropped the base + * lock. If there is the possibility of a concurrent rearm then the return + * value of the function is meaningless. + * + * If such a guarantee is needed, e.g. for teardown situations then use + * timer_shutdown_sync() instead. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +int timer_delete_sync(struct timer_list *timer) +{ + return __timer_delete_sync(timer, false); +} +EXPORT_SYMBOL(timer_delete_sync); + +/** + * timer_shutdown_sync - Shutdown a timer and prevent rearming + * @timer: The timer to be shutdown + * + * When the function returns it is guaranteed that: + * - @timer is not queued + * - The callback function of @timer is not running + * - @timer cannot be enqueued again. Any attempt to rearm + * @timer is silently ignored. + * + * See timer_delete_sync() for synchronization rules. + * + * This function is useful for final teardown of an infrastructure where + * the timer is subject to a circular dependency problem. + * + * A common pattern for this is a timer and a workqueue where the timer can + * schedule work and work can arm the timer. On shutdown the workqueue must + * be destroyed and the timer must be prevented from rearming. Unless the + * code has conditionals like 'if (mything->in_shutdown)' to prevent that + * there is no way to get this correct with timer_delete_sync(). + * + * timer_shutdown_sync() is solving the problem. The correct ordering of + * calls in this case is: + * + * timer_shutdown_sync(&mything->timer); + * workqueue_destroy(&mything->workqueue); + * + * After this 'mything' can be safely freed. 
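+ *
+ * A hedged sketch of such a teardown helper (struct mything and
+ * mything_teardown() are illustrative only; destroy_workqueue() is the
+ * in-tree counterpart of the workqueue_destroy() shorthand above):
+ *
+ * void mything_teardown(struct mything *m)
+ * {
+ *         timer_shutdown_sync(&m->timer);
+ *         destroy_workqueue(m->workqueue);
+ *         kfree(m);
+ * }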
+ * + * This obviously implies that the timer is not required to be functional + * for the rest of the shutdown operation. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending + */ +int timer_shutdown_sync(struct timer_list *timer) +{ + return __timer_delete_sync(timer, true); +} +EXPORT_SYMBOL_GPL(timer_shutdown_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), @@ -1465,8 +1691,8 @@ static void call_timer_fn(struct timer_list *timer, #endif /* * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map around the fn() - * call here and in del_timer_sync(). + * timer_delete_sync() by acquiring the lock_map around the fn() + * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); @@ -1509,6 +1735,12 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) fn = timer->function; + if (WARN_ON_ONCE(!fn)) { + /* Should never happen. Emphasis on should! */ + base->running_timer = NULL; + continue; + } + if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); @@ -1933,7 +2165,7 @@ signed long __sched schedule_timeout(signed long timeout) timer_setup_on_stack(&timer.timer, process_timeout, 0); __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); - del_singleshot_timer_sync(&timer.timer); + del_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer.timer); @@ -2017,8 +2249,6 @@ int timers_dead_cpu(unsigned int cpu) struct timer_base *new_base; int b, i; - BUG_ON(cpu_online(cpu)); - for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); @@ -2035,7 +2265,8 @@ int timers_dead_cpu(unsigned int cpu) */ forward_timer_base(new_base); - BUG_ON(old_base->running_timer); + WARN_ON_ONCE(old_base->running_timer); + old_base->running_timer = NULL; for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1052126bdca2..197545241ab8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -46,10 +46,16 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help If this is set, then arguments and stack can be found from - the pt_regs passed into the function callback regs parameter + the ftrace_regs passed into the function callback regs parameter by default, even without setting the REGS flag in the ftrace_ops. - This allows for use of regs_get_kernel_argument() and - kernel_stack_pointer(). + This allows for use of ftrace_regs_get_argument() and + ftrace_regs_get_stack_pointer(). + +config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + bool + help + If the architecture generates __patchable_function_entries sections + but does not want them included in the ftrace locations. config HAVE_FTRACE_MCOUNT_RECORD bool @@ -76,6 +82,13 @@ config HAVE_OBJTOOL_MCOUNT help Arch supports objtool --mcount +config HAVE_OBJTOOL_NOP_MCOUNT + bool + help + Arch supports the objtool options --mcount with --mnop. + An architecture can select this if it wants to enable nop'ing + of ftrace locations. 
+ config HAVE_C_RECORDMCOUNT bool help @@ -369,6 +382,7 @@ config SCHED_TRACER config HWLAT_TRACER bool "Tracer to detect hardware latencies (like SMIs)" select GENERIC_TRACER + select TRACER_MAX_TRACE help This tracer, when enabled will create one or more kernel threads, depending on what the cpumask file is set to, which each thread @@ -404,6 +418,7 @@ config HWLAT_TRACER config OSNOISE_TRACER bool "OS Noise tracer" select GENERIC_TRACER + select TRACER_MAX_TRACE help In the context of high-performance computing (HPC), the Operating System Noise (osnoise) refers to the interference experienced by an diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7f5eb295fe19..918a7d12df8f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -346,8 +346,40 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { + blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); @@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(q, bt); + blk_trace_cleanup(q, bt); return 0; } @@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { - int ret; struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, @@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start) if (bt == NULL) return -EINVAL; - /* - * For starting a trace, we can transition from a setup or stopped - * trace. 
For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - raw_spin_lock_irq(&running_trace_lock); - list_add(&bt->running_list, &running_trace_list); - raw_spin_unlock_irq(&running_trace_lock); - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) @@ -717,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); */ /** - * blk_trace_ioctl: - handle the ioctls associated with tracing + * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any @@ -765,17 +769,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } /** - * blk_trace_shutdown: - stop and cleanup trace structures + * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->debugfs_mutex))) { - __blk_trace_startstop(q, 0); + lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); - } } #ifdef CONFIG_BLK_CGROUP @@ -1546,7 +1548,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags, static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + if ((iter->ent->type != TRACE_BLK) || + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); @@ -1614,13 +1617,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - } + blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..3bbd3f0c810c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_verifier.h> #include <linux/bpf_perf_event.h> #include <linux/btf.h> #include <linux/filter.h> @@ -20,6 +21,8 @@ #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/key.h> +#include <linux/verification.h> #include <net/bpf_sk_storage.h> @@ -685,6 +688,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; err = __bpf_perf_event_output(regs, map, flags, sd); @@ -743,6 +747,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; ret = __bpf_perf_event_output(regs, map, flags, sd); out: @@ -769,7 +774,7 @@ 
BPF_CALL_0(bpf_get_current_task_btf) const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, - .ret_type = RET_PTR_TO_BTF_ID, + .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; @@ -1026,11 +1031,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_X86_KERNEL_IBT +static unsigned long get_entry_ip(unsigned long fentry_ip) +{ + u32 instr; + + /* Being extra safe in here in case entry ip is on the page-edge. */ + if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) + return fentry_ip; + if (is_endbr(instr)) + fentry_ip -= ENDBR_INSN_SIZE; + return fentry_ip; +} +#else +#define get_entry_ip(fentry_ip) fentry_ip +#endif + BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (uintptr_t)kp->addr : 0; + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { @@ -1181,6 +1205,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). 
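+ *
+ * A hedged sketch of the intended BPF-side use (the dynptrs 'data' and
+ * 'sig' plus the 'trusted' and 'err' locals are illustrative; error
+ * handling trimmed):
+ *
+ * trusted = bpf_lookup_system_key(VERIFY_USE_SECONDARY_KEYRING);
+ * if (!trusted)
+ *         return -ENOENT;
+ * err = bpf_verify_pkcs7_signature(&data, &sig, trusted);
+ * bpf_key_put(trusted);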
+ * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_ptr: data to verify + * @sig_ptr: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, + struct bpf_dynptr_kern *sig_ptr, + struct bpf_key *trusted_keyring) +{ + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). 
+ */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + return verify_pkcs7_signature(data_ptr->data, + bpf_dynptr_get_size(data_ptr), + sig_ptr->data, + bpf_dynptr_get_size(sig_ptr), + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +} +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1255,6 +1457,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; @@ -1279,9 +1485,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: - return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: - return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: @@ -1289,8 +1495,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; @@ -1507,6 +1717,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; @@ -2042,9 +2255,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) 
__VA_ARGS__ @@ -2242,6 +2461,8 @@ struct bpf_kprobe_multi_link { unsigned long *addrs; u64 *cookies; u32 cnt; + u32 mods_cnt; + struct module **mods; }; struct bpf_kprobe_multi_run_ctx { @@ -2297,6 +2518,14 @@ error: return err; } +static void kprobe_multi_put_modules(struct module **mods, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) + module_put(mods[i]); +} + static void free_user_syms(struct user_syms *us) { kvfree(us->syms); @@ -2309,6 +2538,7 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); unregister_fprobe(&kmulti_link->fp); + kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) @@ -2318,6 +2548,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); kvfree(kmulti_link->cookies); + kfree(kmulti_link->mods); kfree(kmulti_link); } @@ -2340,7 +2571,7 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void swap(*cookie_a, *cookie_b); } -static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) { const unsigned long *addr_a = a, *addr_b = b; @@ -2351,7 +2582,7 @@ static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) { - return __bpf_kprobe_multi_cookie_cmp(a, b); + return bpf_kprobe_multi_addrs_cmp(a, b); } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) @@ -2369,7 +2600,7 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) return 0; entry_ip = run_ctx->entry_ip; addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), - __bpf_kprobe_multi_cookie_cmp); + bpf_kprobe_multi_addrs_cmp); if (!addr) return 0; cookie = link->cookies + (addr - link->addrs); @@ -2414,13 +2645,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, } static void -kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, +kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, struct pt_regs *regs) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, entry_ip, regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -2453,6 +2684,71 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv) } } +struct module_addr_args { + unsigned long *addrs; + u32 addrs_cnt; + struct module **mods; + int mods_cnt; + int mods_cap; +}; + +static int module_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct module_addr_args *args = data; + struct module **mods; + + /* We iterate all modules symbols and for each we: + * - search for it in provided addresses array + * - if found we check if we already have the module pointer stored + * (we iterate modules sequentially, so we can check just the last + * module pointer) + * - take module reference and store it + */ + if (!bsearch(&addr, args->addrs, args->addrs_cnt, sizeof(addr), + bpf_kprobe_multi_addrs_cmp)) + return 0; + + if (args->mods && args->mods[args->mods_cnt - 1] == mod) + return 0; + + if (args->mods_cnt == args->mods_cap) { + args->mods_cap = max(16, args->mods_cap * 3 / 2); + mods 
= krealloc_array(args->mods, args->mods_cap, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + args->mods = mods; + } + + if (!try_module_get(mod)) + return -EINVAL; + + args->mods[args->mods_cnt] = mod; + args->mods_cnt++; + return 0; +} + +static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) +{ + struct module_addr_args args = { + .addrs = addrs, + .addrs_cnt = addrs_cnt, + }; + int err; + + /* We return either err < 0 in case of error, ... */ + err = module_kallsyms_on_each_symbol(module_callback, &args); + if (err) { + kprobe_multi_put_modules(args.mods, args.mods_cnt); + kfree(args.mods); + return err; + } + + /* or number of modules found if everything is ok. */ + *mods = args.mods; + return args.mods_cnt; +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2563,10 +2859,25 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr bpf_kprobe_multi_cookie_cmp, bpf_kprobe_multi_cookie_swap, link); + } else { + /* + * We need to sort addrs array even if there are no cookies + * provided, to allow bsearch in get_modules_for_addrs. + */ + sort(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_addrs_cmp, NULL); + } + + err = get_modules_for_addrs(&link->mods, addrs, cnt); + if (err < 0) { + bpf_link_cleanup(&link_primer); + return err; } + link->mods_cnt = err; err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { + kprobe_multi_put_modules(link->mods, link->mods_cnt); bpf_link_cleanup(&link_primer); return err; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index aac63ca9c3d1..e8143e368074 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -141,6 +141,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) return -E2BIG; fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + if (!fp->rethook) + return -ENOMEM; for (i = 0; i < size; i++) { struct fprobe_rethook_node *node; @@ -301,7 +303,8 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || fp->ops.func != fprobe_handler) + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) return -EINVAL; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 439e2ab6905e..442438b93fe9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,7 +163,7 @@ static void ftrace_sync_ipi(void *data) static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic, RCU, or per CPU ops, or we force list func, + * If this is a dynamic or RCU ops, or we force list func, * then it needs to call the list anyway. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || @@ -1289,6 +1289,7 @@ static int ftrace_add_mod(struct trace_array *tr, if (!ftrace_mod) return -ENOMEM; + INIT_LIST_HEAD(&ftrace_mod->list); ftrace_mod->func = kstrdup(func, GFP_KERNEL); ftrace_mod->module = kstrdup(module, GFP_KERNEL); ftrace_mod->enable = enable; @@ -1644,6 +1645,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); +static bool skip_record(struct dyn_ftrace *rec) +{ + /* + * At boot up, weak functions are set to disable. Function tracing + * can be enabled before they are, and they still need to be disabled now. 
+ * If the record is disabled, still continue if it is marked as already + * enabled (this is needed to keep the accounting working). + */ + return rec->flags & FTRACE_FL_DISABLED && + !(rec->flags & FTRACE_FL_ENABLED); +} + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1693,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; if (all) { @@ -2016,7 +2029,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { char ins[MCOUNT_INSN_SIZE]; - int i; if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); @@ -2024,9 +2036,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) } printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]); + pr_cont("%*phC", MCOUNT_INSN_SIZE, ins); } enum ftrace_bug_type ftrace_bug_type; @@ -2126,7 +2136,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) ftrace_bug_type = FTRACE_BUG_UNKNOWN; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) return FTRACE_UPDATE_IGNORE; /* @@ -2241,7 +2251,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) if (update) { /* If there's no more users, clear all flags */ if (!ftrace_rec_count(rec)) - rec->flags = 0; + rec->flags &= FTRACE_FL_DISABLED; else /* * Just disable the record, but keep the ops TRAMP @@ -2478,14 +2488,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr, static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); if (!addr) return; - arch_ftrace_set_direct_caller(regs, addr); + arch_ftrace_set_direct_caller(fregs, addr); } struct ftrace_ops direct_ops = { @@ -2634,7 +2643,7 @@ void __weak ftrace_replace_code(int mod_flags) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; failed = __ftrace_replace_code(rec, enable); @@ -2753,6 +2762,19 @@ void __weak ftrace_arch_code_modify_post_process(void) { } +static int update_ftrace_func(ftrace_func_t func) +{ + static ftrace_func_t save_func; + + /* Avoid updating if it hasn't changed */ + if (func == save_func) + return 0; + + save_func = func; + + return ftrace_update_ftrace_func(func); +} + void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; @@ -2773,7 +2795,7 @@ void ftrace_modify_all_code(int command) * traced. */ if (update) { - err = ftrace_update_ftrace_func(ftrace_ops_list_func); + err = update_ftrace_func(ftrace_ops_list_func); if (FTRACE_WARN_ON(err)) return; } @@ -2789,7 +2811,7 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - err = ftrace_update_ftrace_func(ftrace_trace_function); + err = update_ftrace_func(ftrace_trace_function); if (FTRACE_WARN_ON(err)) return; } @@ -3019,18 +3041,8 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) { - /* - * If these are dynamic or per_cpu ops, they still - * need their data freed. 
Since, function tracing is - * not currently active, we can just free them - * without synchronizing all CPUs. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - goto free_ops; - - return 0; - } + if (!command || !ftrace_enabled) + goto out; /* * If the ops uses a trampoline, then it needs to be @@ -3067,11 +3079,10 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) removed_ops = NULL; ops->flags &= ~FTRACE_OPS_FL_REMOVING; +out: /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the per_cpu - * ops. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* @@ -3094,7 +3105,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPTION)) synchronize_rcu_tasks(); - free_ops: ftrace_trampoline_free(ops); } @@ -3191,7 +3201,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) /* if we can't allocate this size, try something smaller */ if (!order) return -ENOMEM; - order >>= 1; + order--; goto again; } @@ -4193,6 +4203,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) } found = 1; } + cond_resched(); } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); @@ -5427,6 +5438,8 @@ static struct ftrace_ops stub_ops = { * it is safe to modify the ftrace record, where it should be * currently calling @old_addr directly, to call @new_addr. * + * This is called with direct_mutex locked. + * * Safety checks should be made to make sure that the code at * @rec->ip is currently calling @old_addr. And this must * also update entry->direct to @new_addr. @@ -5439,6 +5452,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ip = rec->ip; int ret; + lockdep_assert_held(&direct_mutex); + /* * The ftrace_lock was used to determine if the record * had more than one registered user to it. If it did, @@ -5461,7 +5476,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, if (ret) goto out_lock; - ret = register_ftrace_function(&stub_ops); + ret = register_ftrace_function_nolock(&stub_ops); if (ret) { ftrace_set_filter_ip(&stub_ops, ip, 1, 0); goto out_lock; @@ -6081,8 +6096,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (iter->tr && !list_empty(&iter->tr->mod_trace)) - iter->hash->flags |= FTRACE_HASH_FL_MOD; + if (iter->tr) { + if (list_empty(&iter->tr->mod_trace)) + iter->hash->flags &= ~FTRACE_HASH_FL_MOD; + else + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } } else orig_hash = &iter->ops->func_hash->notrace_hash; @@ -7384,7 +7403,7 @@ void __init ftrace_init(void) } pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, count / ENTRIES_PER_PAGE + 1); + count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); ret = ftrace_process_locs(NULL, __start_mcount_loc, @@ -7511,8 +7530,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true - * if PER_CPU is set, then ftrace_function_local_disable() - * must be false * Otherwise test if the ip matches the ops filter * * If any of the above fails then the op->func() is not executed. 
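 *
 * A hedged sketch of that per-ops filter (simplified; the in-tree loop
 * in __ftrace_ops_list_func() additionally handles stub ops and
 * recursion protection):
 *
 * do_for_each_ftrace_op(op, ftrace_ops_list) {
 *         if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
 *                 continue;
 *         if (!ftrace_ops_test(op, ip, regs))
 *                 continue;
 *         op->func(ip, parent_ip, op, fregs);
 * } while_for_each_ftrace_op(op);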
@@ -7562,8 +7579,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support - * recursion, needs RCU protection and/or requires per cpu handling, then - * this function will be called by the mcount trampoline. + * recursion, needs RCU protection, then this function will be called + * by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -8250,6 +8267,10 @@ struct kallsyms_data { size_t found; }; +/* This function gets called for all kernel and module symbols + * and returns 1 in case we resolved all the requested symbols, + * 0 otherwise. + */ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { @@ -8265,8 +8286,7 @@ static int kallsyms_callback(void *data, const char *name, if (args->addrs[idx]) return 0; - addr = ftrace_location(addr); - if (!addr) + if (!ftrace_location(addr)) return 0; args->addrs[idx] = addr; @@ -8293,17 +8313,19 @@ static int kallsyms_callback(void *data, const char *name, int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { struct kallsyms_data args; - int err; + int found_all; memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; args.found = 0; - err = kallsyms_on_each_symbol(kallsyms_callback, &args); - if (err < 0) - return err; - return args.found == args.cnt ? 0 : -ESRCH; + + found_all = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (found_all) + return 0; + found_all = module_kallsyms_on_each_symbol(kallsyms_callback, &args); + return found_all ? 0 : -ESRCH; } #ifdef CONFIG_SYSCTL diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index 18b0f1cbb947..c736487fc0e4 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -35,6 +35,49 @@ static struct trace_event_file *gen_kprobe_test; static struct trace_event_file *gen_kretprobe_test; +#define KPROBE_GEN_TEST_FUNC "do_sys_open" + +/* X86 */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32) +#define KPROBE_GEN_TEST_ARG0 "dfd=%ax" +#define KPROBE_GEN_TEST_ARG1 "filename=%dx" +#define KPROBE_GEN_TEST_ARG2 "flags=%cx" +#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)" + +/* ARM64 */ +#elif defined(CONFIG_ARM64) +#define KPROBE_GEN_TEST_ARG0 "dfd=%x0" +#define KPROBE_GEN_TEST_ARG1 "filename=%x1" +#define KPROBE_GEN_TEST_ARG2 "flags=%x2" +#define KPROBE_GEN_TEST_ARG3 "mode=%x3" + +/* ARM */ +#elif defined(CONFIG_ARM) +#define KPROBE_GEN_TEST_ARG0 "dfd=%r0" +#define KPROBE_GEN_TEST_ARG1 "filename=%r1" +#define KPROBE_GEN_TEST_ARG2 "flags=%r2" +#define KPROBE_GEN_TEST_ARG3 "mode=%r3" + +/* RISCV */ +#elif defined(CONFIG_RISCV) +#define KPROBE_GEN_TEST_ARG0 "dfd=%a0" +#define KPROBE_GEN_TEST_ARG1 "filename=%a1" +#define KPROBE_GEN_TEST_ARG2 "flags=%a2" +#define KPROBE_GEN_TEST_ARG3 "mode=%a3" + +/* others */ +#else +#define KPROBE_GEN_TEST_ARG0 NULL +#define KPROBE_GEN_TEST_ARG1 NULL +#define KPROBE_GEN_TEST_ARG2 NULL +#define KPROBE_GEN_TEST_ARG3 NULL +#endif + +static bool trace_event_file_is_valid(struct trace_event_file *input) +{ + return input && !IS_ERR(input); +} + /* * Test to make sure we can create a kprobe event, then add more * fields. @@ -58,23 +101,23 @@ static int __init test_gen_kprobe_cmd(void) * fields. 
*/ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", - "do_sys_open", - "dfd=%ax", "filename=%dx"); + KPROBE_GEN_TEST_FUNC, + KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1); if (ret) - goto free; + goto out; /* Use kprobe_event_add_fields to add the rest of the fields */ - ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)"); + ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3); if (ret) - goto free; + goto out; /* * This actually creates the event. */ ret = kprobe_event_gen_cmd_end(&cmd); if (ret) - goto free; + goto out; /* * Now get the gen_kprobe_test event file. We need to prevent @@ -97,13 +140,13 @@ static int __init test_gen_kprobe_cmd(void) goto delete; } out: + kfree(buf); return ret; delete: + if (trace_event_file_is_valid(gen_kprobe_test)) + gen_kprobe_test = NULL; /* We got an error after creating the event, delete it */ ret = kprobe_event_delete("gen_kprobe_test"); - free: - kfree(buf); - goto out; } @@ -128,17 +171,17 @@ static int __init test_gen_kretprobe_cmd(void) * Define the kretprobe event. */ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", - "do_sys_open", + KPROBE_GEN_TEST_FUNC, "$retval"); if (ret) - goto free; + goto out; /* * This actually creates the event. */ ret = kretprobe_event_gen_cmd_end(&cmd); if (ret) - goto free; + goto out; /* * Now get the gen_kretprobe_test event file. We need to @@ -162,13 +205,13 @@ static int __init test_gen_kretprobe_cmd(void) goto delete; } out: + kfree(buf); return ret; delete: + if (trace_event_file_is_valid(gen_kretprobe_test)) + gen_kretprobe_test = NULL; /* We got an error after creating the event, delete it */ ret = kprobe_event_delete("gen_kretprobe_test"); - free: - kfree(buf); - goto out; } @@ -182,10 +225,12 @@ static int __init kprobe_event_gen_test_init(void) ret = test_gen_kretprobe_cmd(); if (ret) { - WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, - "kprobes", - "gen_kretprobe_test", false)); - trace_put_event_file(gen_kretprobe_test); + if (trace_event_file_is_valid(gen_kretprobe_test)) { + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + trace_put_event_file(gen_kretprobe_test); + } WARN_ON(kprobe_event_delete("gen_kretprobe_test")); } @@ -194,24 +239,30 @@ static int __init kprobe_event_gen_test_init(void) static void __exit kprobe_event_gen_test_exit(void) { - /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, - "kprobes", - "gen_kprobe_test", false)); + if (trace_event_file_is_valid(gen_kprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", + "gen_kprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kprobe_test); + } - /* Now give the file and instance back */ - trace_put_event_file(gen_kprobe_test); /* Now unregister and free the event */ WARN_ON(kprobe_event_delete("gen_kprobe_test")); - /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, - "kprobes", - "gen_kretprobe_test", false)); + if (trace_event_file_is_valid(gen_kretprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kretprobe_test); + } - /* Now give the file and instance back */ - 
trace_put_event_file(gen_kretprobe_test); /* Now unregister and free the event */ WARN_ON(kprobe_event_delete("gen_kretprobe_test")); diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index c69d82273ce7..32c3dfdb4d6a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -83,8 +83,10 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler) { struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); - if (!rh || !handler) + if (!rh || !handler) { + kfree(rh); return NULL; + } rh->data = data; rh->handler = handler; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d59b6a328b7f..c366a0a9ddba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -413,6 +413,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -518,6 +519,7 @@ struct ring_buffer_per_cpu { local_t committing; local_t commits; local_t pages_touched; + local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; @@ -884,7 +886,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) } /** - * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * @@ -893,10 +895,18 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; + size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); + lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); + + if (WARN_ON_ONCE(cnt < lost)) + return 0; + + cnt -= lost; + /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); @@ -906,6 +916,21 @@ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) return cnt - read; } +static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + size_t nr_pages; + size_t dirty; + + nr_pages = cpu_buffer->nr_pages; + if (!nr_pages || !full) + return true; + + dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + + return (dirty * 100) > (full * nr_pages); +} + /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * @@ -917,13 +942,56 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); - if (rbwork->wakeup_full) { + if (rbwork->full_waiters_pending || rbwork->wakeup_full) { rbwork->wakeup_full = false; + rbwork->full_waiters_pending = false; wake_up_all(&rbwork->full_waiters); } } /** + * ring_buffer_wake_waiters - wake up any waiters on this ring buffer + * @buffer: The ring buffer to wake waiters on + * + * In the case of a file that represents a ring buffer is closing, + * it is prudent to wake up any waiters that are on this. + */ +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *rbwork; + + if (!buffer) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) { + + /* Wake up individual ones too. 
One level recursion */ + for_each_buffer_cpu(buffer, cpu) + ring_buffer_wake_waiters(buffer, cpu); + + rbwork = &buffer->irq_work; + } else { + if (WARN_ON_ONCE(!buffer->buffers)) + return; + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* The CPU buffer may not have been initialized yet */ + if (!cpu_buffer) + return; + rbwork = &cpu_buffer->irq_work; + } + + rbwork->wait_index++; + /* make sure the waiters see the new index */ + smp_wmb(); + + rb_wake_up_waiters(&rbwork->work); +} + +/** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on @@ -938,6 +1006,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; + long wait_index; int ret = 0; /* @@ -956,6 +1025,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } + wait_index = READ_ONCE(work->wait_index); while (true) { if (full) @@ -1000,26 +1070,29 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) !ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; - size_t nr_pages; - size_t dirty; + bool done; if (!full) break; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + done = !pagebusy && full_hit(buffer, cpu, full); + if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (!pagebusy && - (!nr_pages || (dirty * 100) > full * nr_pages)) + if (done) break; } schedule(); + + /* Make sure to see the new wait index */ + smp_rmb(); + if (wait_index != work->wait_index) + break; } if (full) @@ -1036,6 +1109,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise @@ -1045,14 +1119,15 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, - struct file *filp, poll_table *poll_table) + struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *work; - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + full = 0; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; @@ -1060,8 +1135,14 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + if (full) { + poll_wait(filp, &work->full_waiters, poll_table); + work->full_waiters_pending = true; + } else { + poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + } + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. 
Once the waiters_pending bit @@ -1077,6 +1158,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); + if (full) + return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -1718,9 +1802,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(cpu_buffer->reader_page); - rb_head_page_deactivate(cpu_buffer); - if (head) { + rb_head_page_deactivate(cpu_buffer); + list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); @@ -1956,6 +2040,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) */ local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); } /* @@ -1977,8 +2062,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; int retries, success; + unsigned long flags; - raw_spin_lock_irq(&cpu_buffer->reader_lock); + /* Can be called at early boot up, where interrupts must not been enabled */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to @@ -2035,7 +2122,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * tracing */ RB_WARN_ON(cpu_buffer, !success); - raw_spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { @@ -2163,8 +2250,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - schedule_work_on(cpu, - &cpu_buffer->update_pages_work); + /* Run directly if possible. */ + migrate_disable(); + if (cpu != smp_processor_id()) { + migrate_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } else { + update_pages_handler(&cpu_buffer->update_pages_work); + migrate_enable(); + } } } @@ -2213,9 +2308,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - schedule_work_on(cpu_id, - &cpu_buffer->update_pages_work); - wait_for_completion(&cpu_buffer->update_done); + /* Run directly if possible. 
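+ * (When @cpu_id is the CPU this code runs on, the page update is
+ *  performed inline under migrate_disable(); otherwise it is still
+ *  handed off to the per-CPU worker and waited for.)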
*/ + migrate_disable(); + if (cpu_id == smp_processor_id()) { + rb_update_pages(cpu_buffer); + migrate_enable(); + } else { + migrate_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } } cpu_buffer->nr_pages_to_update = 0; @@ -2440,6 +2543,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, */ local_add(entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); /* * The entries will be zeroed out when we move the @@ -2608,6 +2712,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Mark the rest of the page with padding */ rb_event_set_padding(event); + /* Make sure the padding is visible before the write update */ + smp_wmb(); + /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; @@ -2619,6 +2726,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); @@ -3088,8 +3198,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); @@ -3098,10 +3207,6 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { - size_t nr_pages; - size_t dirty; - size_t full; - if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -3125,10 +3230,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); - full = cpu_buffer->shortest_full; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); - if (full && nr_pages && (dirty * 100) <= full * nr_pages) + if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; @@ -3298,15 +3400,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) * * Must be paired with ring_buffer_lock_reserve. */ -int ring_buffer_unlock_commit(struct trace_buffer *buffer, - struct ring_buffer_event *event) +int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -3892,7 +3993,7 @@ int ring_buffer_write(struct trace_buffer *buffer, memcpy(body, data, length); - rb_commit(cpu_buffer, event); + rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); @@ -4587,6 +4688,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + /* + * The writer has preempt disable, wait for it. 
But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()) + */ + smp_rmb(); + + return reader; } @@ -5164,6 +5292,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; @@ -5232,7 +5361,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** - * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ @@ -5302,7 +5431,7 @@ void ring_buffer_reset(struct trace_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_reset); /** - * rind_buffer_empty - is the ring buffer empty? + * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) @@ -5616,7 +5745,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int pos = 0; unsigned int size; - if (full) + /* + * If a full page is expected, this can still be returned + * if there's been a previous partial read and the + * rest of the page can be read and the commit page is off + * the reader page. 
+ */ + if (full && + (!read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) @@ -5877,7 +6014,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) } out: - ring_buffer_unlock_commit(data->buffer, event); + ring_buffer_unlock_commit(data->buffer); return 0; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 78e576575b79..aef34673d79d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -258,7 +258,7 @@ static void ring_buffer_producer(void) hit++; entry = ring_buffer_event_data(event); *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } } end_time = ktime_get(); diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 83cace53b9fa..b2b49a27e886 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -16,7 +16,7 @@ #include "wip.h" -struct rv_monitor rv_wip; +static struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) @@ -60,7 +60,7 @@ static void disable_wip(void) da_monitor_destroy_wip(); } -struct rv_monitor rv_wip = { +static struct rv_monitor rv_wip = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, @@ -69,13 +69,13 @@ struct rv_monitor rv_wip = { .enabled = 0, }; -static int register_wip(void) +static int __init register_wip(void) { rv_register_monitor(&rv_wip); return 0; } -static void unregister_wip(void) +static void __exit unregister_wip(void) { rv_unregister_monitor(&rv_wip); } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index dacc37b62a2c..2e373f2c65ed 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -static struct automaton_wip automaton_wip = { +static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 599225d9cf38..0e43dd2db685 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -15,7 +15,7 @@ #include "wwnr.h" -struct rv_monitor rv_wwnr; +static struct rv_monitor rv_wwnr; DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); static void handle_switch(void *data, bool preempt, struct task_struct *p, @@ -59,7 +59,7 @@ static void disable_wwnr(void) da_monitor_destroy_wwnr(); } -struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_wwnr = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, @@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = { .enabled = 0, }; -static int register_wwnr(void) +static int __init register_wwnr(void) { rv_register_monitor(&rv_wwnr); return 0; } -static void unregister_wwnr(void) +static void __exit unregister_wwnr(void) { rv_unregister_monitor(&rv_wwnr); } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index 118e576b91b4..d0d9c4b8121b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -static struct automaton_wwnr automaton_wwnr 
= { +static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 0b15e975d2c2..8d77526892f4 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -120,15 +120,13 @@ static int __init test_gen_synth_cmd(void) /* Now generate a gen_synth_test event */ ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("gen_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } /* @@ -227,15 +225,13 @@ static int __init test_empty_synth_event(void) /* Now trace an empty_synth_test event */ ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals)); - out: + free: + kfree(buf); return ret; delete: /* We got an error after creating the event, delete it */ synth_event_delete("empty_synth_test"); - free: - kfree(buf); - - goto out; + goto free; } static struct synth_field_desc create_synth_test_fields[] = { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3005279165d..a555a861b978 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include <linux/kallsyms.h> #include <linux/security.h> #include <linux/seq_file.h> -#include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> #include <linux/tracefs.h> @@ -85,7 +84,7 @@ void __init disable_tracing_selftest(const char *reason) #endif /* Pipe tracepoints to printk */ -struct trace_iterator *tracepoint_print_iter; +static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); @@ -999,7 +998,7 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else - ring_buffer_unlock_commit(buffer, event); + ring_buffer_unlock_commit(buffer); } /** @@ -1193,12 +1192,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return cond_data; } @@ -1334,9 +1335,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); mutex_unlock(&trace_types_lock); @@ -1363,6 +1366,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) @@ -1373,6 +1377,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return ret; } @@ -1415,6 +1420,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1686,6 +1692,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; 
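The tracing_cond_snapshot_data(), tracing_snapshot_cond_enable() and tracing_snapshot_cond_disable() hunks above all wrap arch_spin_lock(&tr->max_lock) in local_irq_disable()/local_irq_enable(). The reason for the pairing is that arch_spin_lock() is a raw lock that does not mask interrupts by itself, so a lock that is also taken from contexts running with interrupts disabled must never be held with interrupts on, or the holder can be interrupted and deadlock against itself. A minimal sketch of the pattern follows; the names (demo_lock, demo_value, demo_irq_update, demo_read) are hypothetical and not part of the tracing code.

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/irqflags.h>

/*
 * Illustrative only: a raw arch_spinlock_t shared between task context
 * and a path that already runs with interrupts disabled.
 */
static arch_spinlock_t demo_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static u64 demo_value;

/* Assumed to be called with interrupts already disabled. */
static void demo_irq_update(u64 val)
{
	arch_spin_lock(&demo_lock);
	demo_value = val;
	arch_spin_unlock(&demo_lock);
}

/*
 * Task-context reader: mask interrupts before taking the raw lock,
 * otherwise demo_irq_update() could preempt us on the same CPU while
 * we hold the lock and spin forever.
 */
static u64 demo_read(void)
{
	u64 val;

	local_irq_disable();
	arch_spin_lock(&demo_lock);
	val = demo_value;
	arch_spin_unlock(&demo_lock);
	local_irq_enable();

	return val;
}
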
#ifdef LATENCY_FS_NOTIFY @@ -1742,18 +1750,14 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, &tr->max_latency, &tracing_max_lat_fops) -#else -#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif -#ifdef CONFIG_TRACER_MAX_TRACE /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -1828,14 +1832,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) - goto out_unlock; + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); - out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1882,6 +1887,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } + #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) @@ -2174,10 +2180,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf) } /* Must have trace_types_lock held */ -void tracing_reset_all_online_cpus(void) +void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; + lockdep_assert_held(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; @@ -2189,6 +2197,13 @@ void tracing_reset_all_online_cpus(void) } } +void tracing_reset_all_online_cpus(void) +{ + mutex_lock(&trace_types_lock); + tracing_reset_all_online_cpus_unlocked(); + mutex_unlock(&trace_types_lock); +} + /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. @@ -2200,6 +2215,11 @@ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -2412,7 +2432,11 @@ static int trace_save_cmdline(struct task_struct *tsk) * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. 
*/ + lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; @@ -5593,7 +5617,7 @@ static const char readme_msg[] = "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" - "\t <type>\\[<array-size>\\]\n" + "\t symstr, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" @@ -5654,6 +5678,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:nohitcount]\n" "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t Note, special fields can be used as well:\n" @@ -5700,7 +5725,9 @@ static const char readme_msg[] = "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" - "\t .usecs display a common_timestamp in microseconds\n\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" @@ -5708,6 +5735,8 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. 
The syntax is analogous to\n" @@ -5890,9 +5919,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, char buf[64]; int r; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -5917,10 +5948,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -6373,10 +6406,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; } @@ -6407,12 +6442,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->current_trace->use_max_tr; + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->allocated_snapshot; - if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that @@ -6425,11 +6460,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } - if (t->use_max_tr && !had_max_tr) { + if (t->use_max_tr && !tr->allocated_snapshot) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } +#else + tr->current_trace = &nop_trace; #endif if (t->init) { @@ -6540,7 +6577,7 @@ out: return ret; } -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -6634,6 +6671,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + kfree(iter->fmt); mutex_destroy(&iter->mutex); kfree(iter); @@ -6658,7 +6696,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, - filp, poll_table); + filp, poll_table, iter->tr->buffer_percent); } static __poll_t @@ -6763,7 +6801,20 @@ waitagain: ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. 
+ */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } @@ -7436,10 +7487,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; @@ -7552,7 +7605,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -7777,6 +7830,7 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, int len) { struct tracing_log_err *err; + char *cmd; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = alloc_tracing_log_err(len); @@ -7785,12 +7839,12 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, return err; } - + cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) + return ERR_PTR(-ENOMEM); err = list_first_entry(&tr->err_log, struct tracing_log_err, list); kfree(err->cmd); - err->cmd = kzalloc(len, GFP_KERNEL); - if (!err->cmd) - return ERR_PTR(-ENOMEM); + err->cmd = cmd; list_del(&err->list); return err; @@ -8137,6 +8191,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8290,6 +8350,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? 
*/ if (!spd.nr_pages) { + long wait_index; + if (ret) goto out; @@ -8297,10 +8359,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; + wait_index = READ_ONCE(iter->wait_index); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + + /* Make sure we see the new wait_index */ + smp_rmb(); + if (wait_index != iter->wait_index) + goto out; + goto again; } @@ -8311,12 +8384,34 @@ out: return ret; } +/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + if (cmd) + return -ENOIOCTLCMD; + + mutex_lock(&trace_types_lock); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + mutex_unlock(&trace_types_lock); + return 0; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, + .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, }; @@ -9005,6 +9100,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf, tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } mutex_unlock(&trace_types_lock); } @@ -9522,7 +9619,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); +#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); @@ -9776,41 +9875,41 @@ static __init int tracer_init_tracefs(void) fs_initcall(tracer_init_tracefs); -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) -{ - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; -} +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; -static int trace_die_handler(struct notifier_block *self, - unsigned long val, - void *data) -{ - switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - break; - default: - break; - } - return NOTIFY_OK; -} - static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; /* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). 
+ */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(ftrace_dump_on_oops); + + return NOTIFY_DONE; +} + +/* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ @@ -10091,7 +10190,7 @@ __init static int tracer_alloc_buffers(void) * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, - "trace/RB:preapre", trace_rb_cpu_prepare, + "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 900e75d96c84..e46a49269be2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -308,8 +308,7 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER) +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -580,6 +579,7 @@ int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); +void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); @@ -614,7 +614,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap); + va_list ap) __printf(2, 0); int trace_empty(struct trace_iterator *iter); @@ -687,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ - || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) +#ifdef CONFIG_FSNOTIFY #define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); @@ -1435,8 +1434,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); - typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { @@ -1455,17 +1452,6 @@ struct regex { regex_match_func match; }; -struct filter_pred { - filter_pred_fn_t fn; - u64 val; - struct regex regex; - unsigned short *ops; - struct ftrace_event_field *field; - int offset; - int not; - int op; -}; - static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -1954,8 +1940,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -extern struct trace_iterator *tracepoint_print_iter; - /* * Reset the state of the trace_iterator so that it 
can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not @@ -1968,17 +1952,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) } /* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) +static inline bool __is_good_name(const char *name, bool hash_ok) { - if (!isalpha(*name) && *name != '_') + if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) return false; while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') + if (!isalpha(*name) && !isdigit(*name) && *name != '_' && + (!hash_ok || *name != '-')) return false; } return true; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + return __is_good_name(name, false); +} + +/* Check the name is good for system */ +static inline bool is_good_system_name(const char *name) +{ + return __is_good_name(name, true); +} + /* Convert certain expected symbols into '_' when generating event names */ static inline void sanitize_event_name(char *name) { diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 801c2a7f7605..54d5fa35c90a 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -51,7 +51,7 @@ static void trace_do_benchmark(void) local_irq_disable(); start = trace_clock_local(); - trace_benchmark_event(bm_str); + trace_benchmark_event(bm_str, bm_last); stop = trace_clock_local(); local_irq_enable(); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 79e6fbe5b365..c3e91060dc94 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void); TRACE_EVENT_FN(benchmark_event, - TP_PROTO(const char *str), + TP_PROTO(const char *str, u64 delta), - TP_ARGS(str), + TP_ARGS(str, delta), TP_STRUCT__entry( __array( char, str, BENCHMARK_EVENT_STRLEN ) + __field( u64, delta) ), TP_fast_assign( memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + __entry->delta = delta; ), - TP_printk("%s", __entry->str), + TP_printk("%s delta=%llu", __entry->str, __entry->delta), trace_benchmark_reg, trace_benchmark_unreg ); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 154996684fb5..4376887e0d8a 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -118,6 +118,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type if (ret) break; } + tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); out: argv_free(argv); @@ -214,6 +215,7 @@ int dyn_events_release_all(struct dyn_event_operations *type) break; } out: + tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); return ret; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 1783e3478912..352b65e2b910 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -16,6 +16,7 @@ #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -26,6 +27,9 @@ struct trace_eprobe { /* tracepoint event */ const char *event_name; + /* filter string for the tracepoint */ + char *filter_str; + struct trace_event_call *event; struct dyn_event devent; @@ -48,6 +52,7 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep->event_system); if (ep->event) trace_event_put_ref(ep->event); + 
kfree(ep->filter_str); kfree(ep); } @@ -453,29 +458,14 @@ NOKPROBE_SYMBOL(process_fetch_insn) static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -485,21 +475,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -509,29 +485,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. 
- */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int @@ -610,6 +564,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data, { struct eprobe_data *edata = data->private_data; + if (unlikely(!rec)) + return; + __eprobe_trace_func(edata, rec); } @@ -664,14 +621,15 @@ static struct event_trigger_data * new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) { struct event_trigger_data *trigger; + struct event_filter *filter = NULL; struct eprobe_data *edata; + int ret; edata = kzalloc(sizeof(*edata), GFP_KERNEL); trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); if (!trigger || !edata) { - kfree(edata); - kfree(trigger); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto error; } trigger->flags = EVENT_TRIGGER_FL_PROBE; @@ -686,13 +644,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->cmd_ops = &event_trigger_cmd; INIT_LIST_HEAD(&trigger->list); - RCU_INIT_POINTER(trigger->filter, NULL); + + if (ep->filter_str) { + ret = create_event_filter(file->tr, ep->event, + ep->filter_str, false, &filter); + if (ret) + goto error; + } + RCU_INIT_POINTER(trigger->filter, filter); edata->file = file; edata->ep = ep; trigger->private_data = edata; return trigger; +error: + free_event_filter(filter); + kfree(edata); + kfree(trigger); + return ERR_PTR(ret); } static int enable_eprobe(struct trace_eprobe *ep, @@ -726,6 +696,7 @@ static int disable_eprobe(struct trace_eprobe *ep, { struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; + struct event_filter *filter; struct eprobe_data *edata; file = find_event_file(tr, ep->event_system, ep->event_name); @@ -752,6 +723,10 @@ static int disable_eprobe(struct trace_eprobe *ep, /* Make sure nothing is using the edata or trigger */ tracepoint_synchronize_unregister(); + filter = rcu_access_pointer(trigger->filter); + + if (filter) + free_event_filter(filter); kfree(edata); kfree(trigger); @@ -927,12 +902,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ return ret; } +static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) +{ + struct event_filter *dummy = NULL; + int i, ret, len = 0; + char *p; + + if (argc == 0) { + trace_probe_log_err(0, NO_EP_FILTER); + return -EINVAL; + } + + /* Recover the filter string */ + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + + ep->filter_str = kzalloc(len, GFP_KERNEL); + if (!ep->filter_str) + return -ENOMEM; + + p = ep->filter_str; + for (i = 0; i < argc; i++) { + ret = snprintf(p, len, "%s ", argv[i]); + if (ret < 0) + goto error; + if (ret > len) { + ret = -E2BIG; + goto error; + } + p += ret; + len -= ret; + } + p[-1] = '\0'; + + /* + * Ensure the filter string can be parsed correctly. Note, this + * filter string is for the original event, not for the eprobe. 
+ */ + ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, + true, &dummy); + free_event_filter(dummy); + if (ret) + goto error; + + return 0; +error: + kfree(ep->filter_str); + ep->filter_str = NULL; + return ret; +} + static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] - * Fetch args: + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER] + * Fetch args (no space): * <name>=$<field>[:TYPE] */ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; @@ -942,8 +967,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - int ret = 0; - int i; + int ret = 0, filter_idx = 0; + int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; @@ -968,11 +993,19 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); + strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); event = buf1; } + for (i = 2; i < argc; i++) { + if (!strcmp(argv[i], "if")) { + filter_idx = i + 1; + filter_cnt = argc - filter_idx; + argc = i; + break; + } + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -988,6 +1021,14 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } + if (filter_idx) { + trace_probe_log_set_index(filter_idx); + ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); + if (ret) + goto parse_error; + } else + ep->filter_str = NULL; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 61e3a2620fa3..05e791241812 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { - func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - return -ENOMEM; - ret = strncpy_from_user( - func, u64_to_user_ptr(p_event->attr.kprobe_func), - KSYM_NAME_LEN); - if (ret == KSYM_NAME_LEN) - ret = -E2BIG; - if (ret < 0) - goto out; + func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? 
-E2BIG : ret; + } if (func[0] == '\0') { kfree(func); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0356cae0cf74..33e0b4f8ebe6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2796,6 +2796,42 @@ trace_create_new_event(struct trace_event_call *call, return file; } +#define MAX_BOOT_TRIGGERS 32 + +static struct boot_triggers { + const char *event; + char *trigger; +} bootup_triggers[MAX_BOOT_TRIGGERS]; + +static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int nr_boot_triggers; + +static __init int setup_trace_triggers(char *str) +{ + char *trigger; + char *buf; + int i; + + strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event triggers"); + + buf = bootup_trigger_buf; + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + trigger = strsep(&buf, ","); + if (!trigger) + break; + bootup_triggers[i].event = strsep(&trigger, "."); + bootup_triggers[i].trigger = strsep(&trigger, "."); + if (!bootup_triggers[i].trigger) + break; + } + + nr_boot_triggers = i; + return 1; +} +__setup("trace_trigger=", setup_trace_triggers); + /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2812,6 +2848,24 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) return event_define_fields(call); } +static void trace_early_triggers(struct trace_event_file *file, const char *name) +{ + int ret; + int i; + + for (i = 0; i < nr_boot_triggers; i++) { + if (strcmp(name, bootup_triggers[i].event)) + continue; + mutex_lock(&event_mutex); + ret = trigger_process_regex(file, bootup_triggers[i].trigger); + mutex_unlock(&event_mutex); + if (ret) + pr_err("Failed to register trigger '%s' on event %s\n", + bootup_triggers[i].trigger, + bootup_triggers[i].event); + } +} + /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before @@ -2822,12 +2876,19 @@ __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; + int ret; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - return event_define_fields(call); + ret = event_define_fields(call); + if (ret) + return ret; + + trace_early_triggers(file, trace_event_name(call)); + + return 0; } struct ftrace_module_file_ops; @@ -2880,7 +2941,10 @@ static int probe_remove_event_call(struct trace_event_call *call) * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) - return -EBUSY; + goto busy; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this @@ -2893,6 +2957,12 @@ static int probe_remove_event_call(struct trace_event_call *call) __trace_remove_event_call(call); return 0; + busy: + /* No need to clear the trace now */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tr->clear_trace = false; + } + return -EBUSY; } /* Remove an event_call */ @@ -2972,7 +3042,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. 
*/ - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus_unlocked(); } static int trace_module_notify(struct notifier_block *self, @@ -3726,6 +3796,8 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + register_trigger_cmds(); + /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories @@ -3740,7 +3812,6 @@ static __init int event_trace_enable(void) register_event_cmds(); - register_trigger_cmds(); return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4b1057ab9d96..96acc2b71ac7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -43,6 +43,42 @@ enum filter_op_ids { OPS }; static const char * ops[] = { OPS }; +enum filter_pred_fn { + FILTER_PRED_FN_NOP, + FILTER_PRED_FN_64, + FILTER_PRED_FN_S64, + FILTER_PRED_FN_U64, + FILTER_PRED_FN_32, + FILTER_PRED_FN_S32, + FILTER_PRED_FN_U32, + FILTER_PRED_FN_16, + FILTER_PRED_FN_S16, + FILTER_PRED_FN_U16, + FILTER_PRED_FN_8, + FILTER_PRED_FN_S8, + FILTER_PRED_FN_U8, + FILTER_PRED_FN_COMM, + FILTER_PRED_FN_STRING, + FILTER_PRED_FN_STRLOC, + FILTER_PRED_FN_STRRELLOC, + FILTER_PRED_FN_PCHAR_USER, + FILTER_PRED_FN_PCHAR, + FILTER_PRED_FN_CPU, + FILTER_PRED_FN_, + FILTER_PRED_TEST_VISITED, +}; + +struct filter_pred { + enum filter_pred_fn fn_num; + u64 val; + struct regex regex; + unsigned short *ops; + struct ftrace_event_field *field; + int offset; + int not; + int op; +}; + /* * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above. @@ -590,44 +626,48 @@ out_free: return ERR_PTR(ret); } +enum pred_cmp_types { + PRED_CMP_TYPE_NOP, + PRED_CMP_TYPE_LT, + PRED_CMP_TYPE_LE, + PRED_CMP_TYPE_GT, + PRED_CMP_TYPE_GE, + PRED_CMP_TYPE_BAND, +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr < val; \ -} \ -static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr <= val; \ -} \ -static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr > val; \ -} \ -static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr >= val; \ -} \ -static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return !!(*addr & val); \ -} \ -static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LE_##type, \ - filter_pred_LT_##type, \ - filter_pred_GE_##type, \ - filter_pred_GT_##type, \ - filter_pred_BAND_##type, \ -}; + switch (pred->op) { \ + case OP_LT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr < val; \ + } \ + case OP_LE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr <= val; \ + } \ + case OP_GT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr > val; \ + } \ + 
case OP_GE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr >= val; \ + } \ + case OP_BAND: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return !!(*addr & val); \ + } \ + default: \ + return 0; \ + } \ +} #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } -static int filter_pred_none(struct filter_pred *pred, void *event) -{ - return 0; -} - /* * regex_match_foo - Basic regex callbacks * @@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred) } } + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int test_pred_visited_fn(struct filter_pred *pred, void *event); +#else +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + return 0; +} +#endif + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event); + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; - int match = pred->fn(pred, rec); + int match = filter_pred_fn_call(pred, rec); if (match == prog[i].when_to_branch) i = prog[i].target; } @@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, - int field_size, int field_is_signed) +static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { - filter_pred_fn_t fn = NULL; + enum filter_pred_fn fn = FILTER_PRED_FN_NOP; int pred_func_index = -1; switch (op) { @@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, break; default: if (WARN_ON_ONCE(op < PRED_FUNC_START)) - return NULL; + return fn; pred_func_index = op - PRED_FUNC_START; if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) - return NULL; + return fn; } switch (field_size) { case 8: if (pred_func_index < 0) - fn = filter_pred_64; + fn = FILTER_PRED_FN_64; else if (field_is_signed) - fn = pred_funcs_s64[pred_func_index]; + fn = FILTER_PRED_FN_S64; else - fn = pred_funcs_u64[pred_func_index]; + fn = FILTER_PRED_FN_U64; break; case 4: if (pred_func_index < 0) - fn = filter_pred_32; + fn = FILTER_PRED_FN_32; else if (field_is_signed) - fn = pred_funcs_s32[pred_func_index]; + fn = FILTER_PRED_FN_S32; else - fn = pred_funcs_u32[pred_func_index]; + fn = FILTER_PRED_FN_U32; break; case 2: if (pred_func_index < 0) - fn = filter_pred_16; + fn = FILTER_PRED_FN_16; else if (field_is_signed) - fn = pred_funcs_s16[pred_func_index]; + fn = FILTER_PRED_FN_S16; else - fn = pred_funcs_u16[pred_func_index]; + fn = FILTER_PRED_FN_U16; break; case 1: if (pred_func_index < 0) - fn = filter_pred_8; + fn = FILTER_PRED_FN_8; else if (field_is_signed) - fn = pred_funcs_s8[pred_func_index]; + fn = FILTER_PRED_FN_S8; else - fn = pred_funcs_u8[pred_func_index]; + fn = FILTER_PRED_FN_U8; break; } return fn; } + +static int filter_pred_fn_call(struct filter_pred *pred, void *event) +{ + switch (pred->fn_num) { + case FILTER_PRED_FN_64: + return filter_pred_64(pred, event); + case FILTER_PRED_FN_S64: + return filter_pred_s64(pred, event); + case FILTER_PRED_FN_U64: + return filter_pred_u64(pred, event); + case FILTER_PRED_FN_32: + return 
filter_pred_32(pred, event); + case FILTER_PRED_FN_S32: + return filter_pred_s32(pred, event); + case FILTER_PRED_FN_U32: + return filter_pred_u32(pred, event); + case FILTER_PRED_FN_16: + return filter_pred_16(pred, event); + case FILTER_PRED_FN_S16: + return filter_pred_s16(pred, event); + case FILTER_PRED_FN_U16: + return filter_pred_u16(pred, event); + case FILTER_PRED_FN_8: + return filter_pred_8(pred, event); + case FILTER_PRED_FN_S8: + return filter_pred_s8(pred, event); + case FILTER_PRED_FN_U8: + return filter_pred_u8(pred, event); + case FILTER_PRED_FN_COMM: + return filter_pred_comm(pred, event); + case FILTER_PRED_FN_STRING: + return filter_pred_string(pred, event); + case FILTER_PRED_FN_STRLOC: + return filter_pred_strloc(pred, event); + case FILTER_PRED_FN_STRRELLOC: + return filter_pred_strrelloc(pred, event); + case FILTER_PRED_FN_PCHAR_USER: + return filter_pred_pchar_user(pred, event); + case FILTER_PRED_FN_PCHAR: + return filter_pred_pchar(pred, event); + case FILTER_PRED_FN_CPU: + return filter_pred_cpu(pred, event); + case FILTER_PRED_TEST_VISITED: + return test_pred_visited_fn(pred, event); + default: + return 0; + } +} + /* Called when a predicate is encountered by predicate_parse() */ static int parse_pred(const char *str, void *data, int pos, struct filter_parse_error *pe, @@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data, parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); goto err_free; } - pred->fn = filter_pred_none; + pred->fn_num = FILTER_PRED_FN_NOP; /* * Quotes are not required, but if they exist then we need @@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void *data, filter_build_regex(pred); if (field->filter_type == FILTER_COMM) { - pred->fn = filter_pred_comm; + pred->fn_num = FILTER_PRED_FN_COMM; } else if (field->filter_type == FILTER_STATIC_STRING) { - pred->fn = filter_pred_string; + pred->fn_num = FILTER_PRED_FN_STRING; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - pred->fn = filter_pred_strloc; + pred->fn_num = FILTER_PRED_FN_STRLOC; } else if (field->filter_type == FILTER_RDYN_STRING) - pred->fn = filter_pred_strrelloc; + pred->fn_num = FILTER_PRED_FN_STRRELLOC; else { if (!ustring_per_cpu) { @@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data, } if (ustring) - pred->fn = filter_pred_pchar_user; + pred->fn_num = FILTER_PRED_FN_PCHAR_USER; else - pred->fn = filter_pred_pchar; + pred->fn_num = FILTER_PRED_FN_PCHAR; } /* go past the last quote */ i++; @@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data, pred->val = val; if (field->filter_type == FILTER_CPU) - pred->fn = filter_pred_cpu; + pred->fn_num = FILTER_PRED_FN_CPU; else { - pred->fn = select_comparison_fn(pred->op, field->size, - field->is_signed); + pred->fn_num = select_comparison_fn(pred->op, field->size, + field->is_signed); if (pred->op == OP_NE) pred->not = 1; } @@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; - WARN_ON_ONCE(!pred->fn); + WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); if (!field) { WARN_ONCE(1, "all leafs should have field defined %d", i); @@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) if (!strchr(fields, *field->name)) continue; - pred->fn = test_pred_visited_fn; + pred->fn_num = FILTER_PRED_TEST_VISITED; } } diff --git a/kernel/trace/trace_events_hist.c 
b/kernel/trace/trace_events_hist.c index fdf784620c28..fcaf226b7744 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -69,7 +69,8 @@ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(DIVISION_BY_ZERO, "Division by zero"), + C(DIVISION_BY_ZERO, "Division by zero"), \ + C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"), #undef C #define C(a, b) HIST_ERR_##a @@ -104,6 +105,38 @@ enum field_op_id { FIELD_OP_MULT, }; +enum hist_field_fn { + HIST_FIELD_FN_NOP, + HIST_FIELD_FN_VAR_REF, + HIST_FIELD_FN_COUNTER, + HIST_FIELD_FN_CONST, + HIST_FIELD_FN_LOG2, + HIST_FIELD_FN_BUCKET, + HIST_FIELD_FN_TIMESTAMP, + HIST_FIELD_FN_CPU, + HIST_FIELD_FN_STRING, + HIST_FIELD_FN_DYNSTRING, + HIST_FIELD_FN_RELDYNSTRING, + HIST_FIELD_FN_PSTRING, + HIST_FIELD_FN_S64, + HIST_FIELD_FN_U64, + HIST_FIELD_FN_S32, + HIST_FIELD_FN_U32, + HIST_FIELD_FN_S16, + HIST_FIELD_FN_U16, + HIST_FIELD_FN_S8, + HIST_FIELD_FN_U8, + HIST_FIELD_FN_UMINUS, + HIST_FIELD_FN_MINUS, + HIST_FIELD_FN_PLUS, + HIST_FIELD_FN_DIV, + HIST_FIELD_FN_MULT, + HIST_FIELD_FN_DIV_POWER2, + HIST_FIELD_FN_DIV_NOT_POWER2, + HIST_FIELD_FN_DIV_MULT_SHIFT, + HIST_FIELD_FN_EXECNAME, +}; + /* * A hist_var (histogram variable) contains variable information for * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF @@ -123,15 +156,15 @@ struct hist_var { struct hist_field { struct ftrace_event_field *field; unsigned long flags; - hist_field_fn_t fn; - unsigned int ref; - unsigned int size; - unsigned int offset; - unsigned int is_signed; unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + enum hist_field_fn fn_num; + unsigned int ref; + unsigned int size; + unsigned int offset; + unsigned int is_signed; /* * Variable fields contain variable-specific info in var. 
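The enum hist_field_fn introduced above (like the matching enum filter_pred_fn earlier in trace_events_filter.c) replaces a stored function pointer with a small integer that one central dispatcher resolves in a switch (hist_fn_call() / filter_pred_fn_call()). The sketch below only illustrates that dispatch pattern; all names (demo_fn, demo_field, demo_fn_call, ...) are hypothetical, not the real hist/filter types.

#include <stdint.h>
#include <stdio.h>

enum demo_fn {
	DEMO_FN_NOP,
	DEMO_FN_U32,
	DEMO_FN_S32,
};

struct demo_field {
	enum demo_fn	fn_num;		/* replaces a function pointer */
	unsigned int	offset;		/* byte offset into the event */
};

static uint64_t demo_u32(struct demo_field *f, void *event)
{
	return *(uint32_t *)((char *)event + f->offset);
}

static uint64_t demo_s32(struct demo_field *f, void *event)
{
	/* sign-extend, as a signed 32-bit fetch must */
	return (uint64_t)(int64_t)*(int32_t *)((char *)event + f->offset);
}

/* One central dispatcher, analogous to hist_fn_call(). */
static uint64_t demo_fn_call(struct demo_field *f, void *event)
{
	switch (f->fn_num) {
	case DEMO_FN_U32:
		return demo_u32(f, event);
	case DEMO_FN_S32:
		return demo_s32(f, event);
	case DEMO_FN_NOP:
	default:
		return 0;
	}
}

int main(void)
{
	int32_t raw = -42;
	struct demo_field f = { .fn_num = DEMO_FN_S32, .offset = 0 };

	printf("%lld\n", (long long)demo_fn_call(&f, &raw));
	return 0;
}

A practical effect of this shape, visible in filter_match_preds() and the hist_field_*() helpers in the hunks above, is that every invocation becomes a direct call through an explicit, enumerable set of handlers rather than an indirect call through a stored pointer.
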
@@ -166,14 +199,11 @@ struct hist_field { u64 div_multiplier; }; -static u64 hist_field_none(struct hist_field *field, - struct tracing_map_elt *elt, - struct trace_buffer *buffer, - struct ring_buffer_event *rbe, - void *event) -{ - return 0; -} +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); static u64 hist_field_const(struct hist_field *field, struct tracing_map_elt *elt, @@ -250,7 +280,7 @@ static u64 hist_field_log2(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -264,7 +294,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field, struct hist_field *operand = hist_field->operands[0]; unsigned long buckets = hist_field->buckets; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); if (WARN_ON_ONCE(!buckets)) return val; @@ -285,8 +315,8 @@ static u64 hist_field_plus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 + val2; } @@ -300,8 +330,8 @@ static u64 hist_field_minus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 - val2; } @@ -315,8 +345,8 @@ static u64 hist_field_div(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); /* Return -1 for the undefined case */ if (!val2) @@ -338,7 +368,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return val1 >> __ffs64(operand2->constant); } @@ -352,7 +382,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return div64_u64(val1, operand2->constant); } @@ -366,7 +396,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = 
hist_fn_call(operand1, elt, buffer, rbe, event); /* * If the divisor is a constant, do a multiplication and shift instead. @@ -400,8 +430,8 @@ static u64 hist_field_mult(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 * val2; } @@ -414,7 +444,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); + s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -477,6 +507,8 @@ enum hist_field_flags { HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, }; struct var_defs { @@ -495,6 +527,7 @@ struct hist_trigger_attrs { bool cont; bool clear; bool ts_in_usecs; + bool no_hitcount; unsigned int map_bits; char *assignment_str[TRACING_MAP_VARS_MAX]; @@ -588,7 +621,7 @@ struct action_data { * event param, and is passed to the synthetic event * invocation. */ - unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; + unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; struct synth_event *synth_event; bool use_trace_keyword; char *synth_event_name; @@ -657,19 +690,19 @@ struct snapshot_context { * Returns the specific division function to use if the divisor * is constant. This avoids extra branches when the trigger is hit. */ -static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor) +static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor) { u64 div = divisor->constant; if (!(div & (div - 1))) - return div_by_power_of_two; + return HIST_FIELD_FN_DIV_POWER2; /* If the divisor is too large, do a regular division */ if (div > (1 << HIST_DIV_SHIFT)) - return div_by_not_power_of_two; + return HIST_FIELD_FN_DIV_NOT_POWER2; divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); - return div_by_mult_and_shift; + return HIST_FIELD_FN_DIV_MULT_SHIFT; } static void track_data_free(struct track_data *track_data) @@ -954,7 +987,7 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, * A trigger can define one or more variables. If any one of them is * currently referenced by any other trigger, this function will * determine that. - + * * Typically used to determine whether or not a trigger can be removed * - if there are any references to a trigger's variables, it cannot. 
* @@ -1327,6 +1360,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_HITCOUNT) + field_name = "hitcount"; if (field_name == NULL) field_name = ""; @@ -1334,38 +1369,32 @@ static const char *hist_field_name(struct hist_field *field, return field_name; } -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +static enum hist_field_fn select_value_fn(int field_size, int field_is_signed) { - hist_field_fn_t fn = NULL; - switch (field_size) { case 8: if (field_is_signed) - fn = hist_field_s64; + return HIST_FIELD_FN_S64; else - fn = hist_field_u64; - break; + return HIST_FIELD_FN_U64; case 4: if (field_is_signed) - fn = hist_field_s32; + return HIST_FIELD_FN_S32; else - fn = hist_field_u32; - break; + return HIST_FIELD_FN_U32; case 2: if (field_is_signed) - fn = hist_field_s16; + return HIST_FIELD_FN_S16; else - fn = hist_field_u16; - break; + return HIST_FIELD_FN_U16; case 1: if (field_is_signed) - fn = hist_field_s8; + return HIST_FIELD_FN_S8; else - fn = hist_field_u8; - break; + return HIST_FIELD_FN_U8; } - return fn; + return HIST_FIELD_FN_NOP; } static int parse_map_size(char *str) @@ -1523,7 +1552,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) ret = parse_assignment(tr, str, attrs); if (ret) goto free; - } else if (strcmp(str, "pause") == 0) + } else if (strcmp(str, "nohitcount") == 0 || + strcmp(str, "NOHC") == 0) + attrs->no_hitcount = true; + else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) @@ -1682,6 +1714,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; return flags_str; } @@ -1922,19 +1958,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; /* caller will populate */ if (flags & HIST_FIELD_FL_VAR_REF) { - hist_field->fn = hist_field_var_ref; + hist_field->fn_num = HIST_FIELD_FN_VAR_REF; goto out; } if (flags & HIST_FIELD_FL_HITCOUNT) { - hist_field->fn = hist_field_counter; + hist_field->fn_num = HIST_FIELD_FN_COUNTER; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CONST) { - hist_field->fn = hist_field_const; + hist_field->fn_num = HIST_FIELD_FN_CONST; hist_field->size = sizeof(u64); hist_field->type = kstrdup("u64", GFP_KERNEL); if (!hist_field->type) @@ -1943,14 +1979,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn = hist_field_none; + hist_field->fn_num = HIST_FIELD_FN_NOP; goto out; } if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); - hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : - hist_field_bucket; + hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? 
HIST_FIELD_FN_LOG2 : + HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); @@ -1960,14 +1996,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_TIMESTAMP) { - hist_field->fn = hist_field_timestamp; + hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { - hist_field->fn = hist_field_cpu; + hist_field->fn_num = HIST_FIELD_FN_CPU; hist_field->size = sizeof(int); hist_field->type = "unsigned int"; goto out; @@ -1987,14 +2023,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto free; if (field->filter_type == FILTER_STATIC_STRING) { - hist_field->fn = hist_field_string; + hist_field->fn_num = HIST_FIELD_FN_STRING; hist_field->size = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - hist_field->fn = hist_field_dynstring; + hist_field->fn_num = HIST_FIELD_FN_DYNSTRING; } else if (field->filter_type == FILTER_RDYN_STRING) - hist_field->fn = hist_field_reldynstring; + hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else - hist_field->fn = hist_field_pstring; + hist_field->fn_num = HIST_FIELD_FN_PSTRING; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; @@ -2002,9 +2038,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - hist_field->fn = select_value_fn(field->size, - field->is_signed); - if (!hist_field->fn) { + hist_field->fn_num = select_value_fn(field->size, + field->is_signed); + if (hist_field->fn_num == HIST_FIELD_FN_NOP) { destroy_hist_field(hist_field, 0); return NULL; } @@ -2150,7 +2186,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, return ref_field; } } - + /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ + if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) + return NULL; ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { if (init_var_ref(ref_field, var_field, system, event_name)) { @@ -2290,6 +2328,14 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (ret || !(*buckets)) goto error; *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -2305,6 +2351,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; + else if (strcmp(field_name, "hitcount") == 0) + *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { @@ -2340,7 +2388,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, if (!alias) return NULL; - alias->fn = var_ref->fn; + alias->fn_num = var_ref->fn_num; alias->operands[0] = var_ref; if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { 
@@ -2523,7 +2571,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); - expr->fn = hist_field_unary_minus; + expr->fn_num = HIST_FIELD_FN_UMINUS; expr->operands[0] = operand1; expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -2595,7 +2643,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - hist_field_fn_t op_fn; + enum hist_field_fn op_fn; bool combine_consts; if (*n_subexprs > 3) { @@ -2654,16 +2702,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, switch (field_op) { case FIELD_OP_MINUS: - op_fn = hist_field_minus; + op_fn = HIST_FIELD_FN_MINUS; break; case FIELD_OP_PLUS: - op_fn = hist_field_plus; + op_fn = HIST_FIELD_FN_PLUS; break; case FIELD_OP_DIV: - op_fn = hist_field_div; + op_fn = HIST_FIELD_FN_DIV; break; case FIELD_OP_MULT: - op_fn = hist_field_mult; + op_fn = HIST_FIELD_FN_MULT; break; default: ret = -EINVAL; @@ -2719,13 +2767,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, op_fn = hist_field_get_div_fn(operand2); } + expr->fn_num = op_fn; + if (combine_consts) { if (var1) expr->operands[0] = var1; if (var2) expr->operands[1] = var2; - expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); + expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL); + expr->fn_num = HIST_FIELD_FN_CONST; expr->operands[0] = NULL; expr->operands[1] = NULL; @@ -2739,8 +2790,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->name = expr_str(expr, 0); } else { - expr->fn = op_fn; - /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -3065,7 +3114,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, buffer, rbe, rec); + var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -3202,7 +3251,7 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, * events. However, for convenience, users are allowed to directly * specify an event field in an action, which will be automatically * converted into a variable on their behalf. - + * * This function creates a field variable with the name var_name on * the hist trigger currently being defined on the target event. 
If * subsys_name and event_name are specified, this function simply @@ -3562,6 +3611,7 @@ static int parse_action_params(struct trace_array *tr, char *params, while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); + ret = -EINVAL; goto out; } @@ -3898,6 +3948,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data, lockdep_assert_held(&event_mutex); + /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ + if (data->n_params > SYNTH_FIELDS_MAX) + return -EINVAL; + if (data->use_trace_keyword) synth_event_name = data->synth_event_name; else @@ -4186,6 +4240,74 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + switch (hist_field->fn_num) { + case HIST_FIELD_FN_VAR_REF: + return hist_field_var_ref(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COUNTER: + return hist_field_counter(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CONST: + return hist_field_const(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_LOG2: + return hist_field_log2(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_BUCKET: + return hist_field_bucket(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_TIMESTAMP: + return hist_field_timestamp(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CPU: + return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STRING: + return hist_field_string(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DYNSTRING: + return hist_field_dynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_RELDYNSTRING: + return hist_field_reldynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PSTRING: + return hist_field_pstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S64: + return hist_field_s64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U64: + return hist_field_u64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S32: + return hist_field_s32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U32: + return hist_field_u32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S16: + return hist_field_s16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U16: + return hist_field_u16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S8: + return hist_field_s8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U8: + return hist_field_u8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_UMINUS: + return hist_field_unary_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MINUS: + return hist_field_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PLUS: + return hist_field_plus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV: + return hist_field_div(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MULT: + return hist_field_mult(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_POWER2: + return div_by_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_NOT_POWER2: + return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_MULT_SHIFT: + return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_EXECNAME: + return 
hist_field_execname(hist_field, elt, buffer, rbe, event); + default: + return 0; + } +} + /* Convert a var that points to common_pid.execname to a string */ static void update_var_execname(struct hist_field *hist_field) { @@ -4197,7 +4319,7 @@ static void update_var_execname(struct hist_field *hist_field) kfree_const(hist_field->type); hist_field->type = "char[]"; - hist_field->fn = hist_field_execname; + hist_field->fn_num = HIST_FIELD_FN_EXECNAME; } static int create_var_field(struct hist_trigger_data *hist_data, @@ -4236,8 +4358,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + unsigned int i, j = 1, n_hitcount = 0; char *fields_str, *field_str; - unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -4254,8 +4376,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (!field_str) break; - if (strcmp(field_str, "hitcount") == 0) - continue; + if (strcmp(field_str, "hitcount") == 0) { + if (!n_hitcount++) + continue; + } ret = create_val_field(hist_data, j++, file, field_str); if (ret) @@ -4265,6 +4389,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data, if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: + /* There is only raw hitcount but nohitcount suppresses it. */ + if (j == 1 && hist_data->attrs->no_hitcount) { + hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0); + ret = -ENOENT; + } + return ret; } @@ -4956,7 +5086,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4987,7 +5117,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } @@ -5051,6 +5181,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *key = NULL; unsigned int i; + if (unlikely(!rbe)) + return; + memset(compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { @@ -5062,7 +5195,7 @@ static void event_hist_trigger(struct event_trigger_data *data, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -5190,33 +5323,101 @@ static void hist_trigger_print_key(struct seq_file *m, seq_puts(m, "}"); } +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? 
UINT_MAX : 0; +} + +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + struct hist_val_stat *stats, + struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + char bar[21]; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, stats[idx].total); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, + struct hist_val_stat *stats, void *key, struct tracing_map_elt *elt) { const char *field_name; - unsigned int i; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; hist_trigger_print_key(m, hist_data, key, elt); - seq_printf(m, " hitcount: %10llu", - tracing_map_read_sum(elt, HITCOUNT_IDX)); + /* At first, show the raw hitcount if !nohitcount */ + if (!hist_data->attrs->no_hitcount) + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); - - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || - hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + flags = hist_data->fields[i]->flags; + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", field_name, - tracing_map_read_sum(elt, i)); - } else { - seq_printf(m, " %s: %10llu", field_name, - tracing_map_read_sum(elt, i)); - } + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); } print_actions(m, hist_data, elt); @@ -5229,7 +5430,9 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - int i, n_entries; + int i, j, n_entries; + struct hist_val_stat *stats = NULL; + u64 val; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5237,11 +5440,34 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + /* Calculate the max and the total for each field if needed. 
*/ + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) + continue; + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } + } + for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, + hist_trigger_entry_print(m, hist_data, stats, sort_entries[i]->key, sort_entries[i]->elt); + kfree(stats); +out: tracing_map_destroy_sort_entries(sort_entries, n_entries); return n_entries; @@ -5631,6 +5857,7 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_trigger_data *hist_data = data->private_data; struct hist_field *field; bool have_var = false; + bool show_val = false; unsigned int i; seq_puts(m, HIST_PREFIX); @@ -5661,12 +5888,16 @@ static int event_hist_trigger_print(struct seq_file *m, continue; } - if (i == HITCOUNT_IDX) + if (i == HITCOUNT_IDX) { + if (hist_data->attrs->no_hitcount) + continue; seq_puts(m, "hitcount"); - else { - seq_puts(m, ","); + } else { + if (show_val) + seq_puts(m, ","); hist_field_print(m, field); } + show_val = true; } if (have_var) { @@ -5717,6 +5948,8 @@ static int event_hist_trigger_print(struct seq_file *m, seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); if (hist_data->enable_timestamps) seq_printf(m, ":clock=%s", hist_data->attrs->clock); + if (hist_data->attrs->no_hitcount) + seq_puts(m, ":nohitcount"); print_actions_spec(m, hist_data); @@ -6343,7 +6576,7 @@ enable: if (se) se->ref++; out: - if (ret == 0) + if (ret == 0 && glob[0]) hist_err_clear(); return ret; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 5e8c07aef071..67592eed0be8 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -17,6 +17,8 @@ /* for gfp flag names */ #include <linux/trace_events.h> #include <trace/events/mmflags.h> +#include "trace_probe.h" +#include "trace_probe_kernel.h" #include "trace_synth.h" @@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, { unsigned int len = 0; char *str_field; + int ret; if (is_dynamic) { u32 data_offset; @@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry, data_offset += event->n_u64 * sizeof(u64); data_offset += data_size; - str_field = (char *)entry + data_offset; - - len = strlen(str_val) + 1; - strscpy(str_field, str_val, len); + len = kern_fetch_store_strlen((unsigned long)str_val); data_offset |= len << 16; *(u32 *)&entry->fields[*n_u64] = data_offset; + ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + (*n_u64)++; } else { str_field = (char *)&entry->fields[*n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)str_val < TASK_SIZE) + ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + else +#endif + ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); + + if (ret < 0) + strcpy(str_field, FAULT_STRING); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); } @@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = strlen(str_val) + 1; + len = kern_fetch_store_strlen((unsigned 
long)str_val); fields_size += len; } @@ -817,10 +828,9 @@ static int register_synth_event(struct synth_event *event) } ret = set_synth_event_print_fmt(call); - if (ret < 0) { + /* unregister_trace_event() will be called inside */ + if (ret < 0) trace_remove_event_call(call); - goto err; - } out: return ret; err: @@ -1272,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err_free_arg; } - fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err_free_arg; } + fields[n_fields++] = field; n_fields_this_loop++; } @@ -1415,7 +1425,6 @@ int synth_event_delete(const char *event_name) mutex_unlock(&event_mutex); if (mod) { - mutex_lock(&trace_types_lock); /* * It is safest to reset the ring buffer if the module * being unloaded registered any events that were @@ -1427,7 +1436,6 @@ int synth_event_delete(const char *event_name) * occur. */ tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); } return ret; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 918730d74932..e535959939d3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str, /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, - filter_str, false, &filter); + filter_str, true, &filter); + + /* Only enabled set_str for error handling */ + if (filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; + } + /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. @@ -1078,8 +1085,14 @@ int set_trigger_filter(char *filter_str, rcu_assign_pointer(data->filter, filter); if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); + /* + * Make sure the call is done with the filter. + * It is possible that a filter could fail at boot up, + * and then this path will be called. Avoid the synchronization + * in that case. + */ + if (system_state != SYSTEM_BOOTING) + tracepoint_synchronize_unregister(); free_event_filter(tmp); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index a6621c52ce45..908e8a13c675 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -14,6 +14,7 @@ #include <linux/uio.h> #include <linux/ioctl.h> #include <linux/jhash.h> +#include <linux/refcount.h> #include <linux/trace_events.h> #include <linux/tracefs.h> #include <linux/types.h> @@ -39,28 +40,69 @@ */ #define MAX_PAGE_ORDER 0 #define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) +#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) +#define MAX_EVENTS (MAX_BYTES * 8) /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -#define MAX_FIELD_ARG_NAME 256 -static char *register_page_data; +/* + * The MAP_STATUS_* macros are used for taking a index and determining the + * appropriate byte and the bit in the byte to set/reset for an event. + * + * The lower 3 bits of the index decide which bit to set. + * The remaining upper bits of the index decide which byte to use for the bit. 
+ * + * This is used when an event has a probe attached/removed to reflect live + * status of the event wanting tracing or not to user-programs via shared + * memory maps. + */ +#define MAP_STATUS_BYTE(index) ((index) >> 3) +#define MAP_STATUS_MASK(index) BIT((index) & 7) + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. These + * bits are compared against an internal byte on the event to determine which + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. + */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + +/* + * Stores the pages, tables, and locks for a group of events. + * Each logical grouping of events has its own group, with a + * matching page for status checks within user programs. This + * allows for isolation of events to user programs by various + * means. + */ +struct user_event_group { + struct page *pages; + char *register_page_data; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; + DECLARE_HASHTABLE(register_table, 8); + DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +}; -static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 4); -static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +/* Group for init_user_ns mapping, top-most group */ +static struct user_event_group *init_group; /* * Stores per-event properties, as users register events * within a file a user_event might be created if it does not * already exist. These are globally used and their lifetime * is tied to the refcnt member. These cannot go away until the - * refcnt reaches zero. + * refcnt reaches one. */ struct user_event { + struct user_event_group *group; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -68,10 +110,11 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; - atomic_t refcnt; + refcount_t refcnt; int index; int flags; int min_size; + char status; }; /* @@ -86,6 +129,11 @@ struct user_event_refs { struct user_event *events[]; }; +struct user_event_file_info { + struct user_event_group *group; + struct user_event_refs *refs; +}; + #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) @@ -98,7 +146,8 @@ struct user_event_validator { typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser); static u32 user_event_key(char *name) @@ -106,6 +155,144 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static void set_page_reservations(char *pages, bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = pages + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static void user_event_group_destroy(struct user_event_group *group) +{ + if (group->register_page_data) + set_page_reservations(group->register_page_data, false); + + if (group->pages) + __free_pages(group->pages, MAX_PAGE_ORDER); + + kfree(group->system_name); + kfree(group); +} + +static char *user_event_group_system_name(struct user_namespace *user_ns) +{ + char *system_name; + int len = 
sizeof(USER_EVENTS_SYSTEM) + 1; + + if (user_ns != &init_user_ns) { + /* + * Unexpected at this point: + * We only currently support init_user_ns. + * When we enable more, this will trigger a failure so log. + */ + pr_warn("user_events: Namespace other than init_user_ns!\n"); + return NULL; + } + + system_name = kmalloc(len, GFP_KERNEL); + + if (!system_name) + return NULL; + + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + + return system_name; +} + +static inline struct user_event_group +*user_event_group_from_user_ns(struct user_namespace *user_ns) +{ + if (user_ns == &init_user_ns) + return init_group; + + return NULL; +} + +static struct user_event_group *current_user_event_group(void) +{ + struct user_namespace *user_ns = current_user_ns(); + struct user_event_group *group = NULL; + + while (user_ns) { + group = user_event_group_from_user_ns(user_ns); + + if (group) + break; + + user_ns = user_ns->parent; + } + + return group; +} + +static struct user_event_group +*user_event_group_create(struct user_namespace *user_ns) +{ + struct user_event_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + + if (!group) + return NULL; + + group->system_name = user_event_group_system_name(user_ns); + + if (!group->system_name) + goto error; + + group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + + if (!group->pages) + goto error; + + group->register_page_data = page_address(group->pages); + + set_page_reservations(group->register_page_data, true); + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(group->page_bitmap, MAX_EVENTS); + set_bit(0, group->page_bitmap); + + mutex_init(&group->reg_mutex); + hash_init(group->register_table); + + return group; +error: + if (group) + user_event_group_destroy(group); + + return NULL; +}; + +static __always_inline +void user_event_register_set(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); +} + +static __always_inline +void user_event_register_clear(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); +} + +static __always_inline __must_check +bool user_event_last_ref(struct user_event *user) +{ + return refcount_read(&user->refcnt) == 1; +} + static __always_inline __must_check size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) { @@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * * Upon success user_event has its ref count increased by 1. 
*/ -static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +static int user_event_parse_cmd(struct user_event_group *group, + char *raw_command, struct user_event **newuser) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) if (flags) *flags++ = '\0'; - return user_event_parse(name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser); } static int user_field_array_size(const char *type) @@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type, goto add_field; add_validator: - if (strstr(type, "char") != 0) + if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; validator = kmalloc(sizeof(*validator), GFP_KERNEL); @@ -458,7 +646,7 @@ static const char *user_field_format(const char *type) return "%d"; if (strcmp(type, "unsigned char") == 0) return "%u"; - if (strstr(type, "char[") != 0) + if (strstr(type, "char[") != NULL) return "%s"; /* Unknown, likely struct, allowed treat as 64-bit */ @@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func) return false; check: - return strstr(type, "char") != 0; + return strstr(type, "char") != NULL; } #define LEN_OR_ZERO (len ? len - pos : 0) +static int user_dyn_field_set_string(int argc, const char **argv, int *iout, + char *buf, int len, bool *colon) +{ + int pos = 0, i = *iout; + + *colon = false; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + *colon = true; + break; + } + } + + /* Actual set, advance i */ + if (len != 0) + *iout = i; + + return pos + 1; +} + +static int user_field_set_string(struct ftrace_event_field *field, + char *buf, int len, bool colon) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + + if (colon) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + return pos + 1; +} + static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { struct ftrace_event_field *field, *next; @@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user) dyn_event_remove(&user->devent); - register_page_data[user->index] = 0; - clear_bit(user->index, page_bitmap); + user_event_register_clear(user); + clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user) return ret; } -static struct user_event *find_user_event(char *name, u32 *outkey) +static struct user_event *find_user_event(struct user_event_group *group, + char *name, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(register_table, user, node, key) + hash_for_each_possible(group->register_table, user, node, key) if (!strcmp(EVENT_NAME(user), name)) { - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); return user; } @@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - register_page_data[user->index] = status; + if (status) + user_event_register_set(user); + else + user_event_register_clear(user); + + user->status = status; } /* @@ -835,17 +1071,18 @@ static 
int user_event_reg(struct trace_event_call *call, return ret; inc: - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); update_reg_page_for(user); return 0; dec: update_reg_page_for(user); - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); return 0; } static int user_event_create(const char *raw_command) { + struct user_event_group *group; struct user_event *user; char *name; int ret; @@ -861,14 +1098,21 @@ static int user_event_create(const char *raw_command) if (!name) return -ENOMEM; - mutex_lock(®_mutex); + group = current_user_event_group(); - ret = user_event_parse_cmd(name, &user); + if (!group) { + kfree(name); + return -ENOENT; + } + + mutex_lock(&group->reg_mutex); + + ret = user_event_parse_cmd(group, name, &user); if (!ret) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); if (ret) kfree(name); @@ -910,14 +1154,14 @@ static bool user_event_is_busy(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - return atomic_read(&user->refcnt) != 0; + return !user_event_last_ref(user); } static int user_event_free(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - if (atomic_read(&user->refcnt) != 0) + if (!user_event_last_ref(user)) return -EBUSY; return destroy_user_event(user); @@ -926,49 +1170,35 @@ static int user_event_free(struct dyn_event *ev) static bool user_field_match(struct ftrace_event_field *field, int argc, const char **argv, int *iout) { - char *field_name, *arg_name; - int len, pos, i = *iout; + char *field_name = NULL, *dyn_field_name = NULL; bool colon = false, match = false; + int dyn_len, len; - if (i >= argc) + if (*iout >= argc) return false; - len = MAX_FIELD_ARG_NAME; - field_name = kmalloc(len, GFP_KERNEL); - arg_name = kmalloc(len, GFP_KERNEL); + dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + 0, &colon); - if (!arg_name || !field_name) - goto out; + len = user_field_set_string(field, field_name, 0, colon); - pos = 0; - - for (; i < argc; ++i) { - if (i != *iout) - pos += snprintf(arg_name + pos, len - pos, " "); - - pos += snprintf(arg_name + pos, len - pos, argv[i]); - - if (strchr(argv[i], ';')) { - ++i; - colon = true; - break; - } - } + if (dyn_len != len) + return false; - pos = 0; + dyn_field_name = kmalloc(dyn_len, GFP_KERNEL); + field_name = kmalloc(len, GFP_KERNEL); - pos += snprintf(field_name + pos, len - pos, field->type); - pos += snprintf(field_name + pos, len - pos, " "); - pos += snprintf(field_name + pos, len - pos, field->name); + if (!dyn_field_name || !field_name) + goto out; - if (colon) - pos += snprintf(field_name + pos, len - pos, ";"); + user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + dyn_len, &colon); - *iout = i; + user_field_set_string(field, field_name, len, colon); - match = strcmp(arg_name, field_name) == 0; + match = strcmp(dyn_field_name, field_name) == 0; out: - kfree(arg_name); + kfree(dyn_field_name); kfree(field_name); return match; @@ -1036,7 +1266,8 @@ static int user_event_trace_register(struct user_event *user) * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1. 
*/ -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser) { int ret; @@ -1046,7 +1277,7 @@ static int user_event_parse(char *name, char *args, char *flags, /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(name, &key); + user = find_user_event(group, name, &key); mutex_unlock(&event_mutex); if (user) { @@ -1059,7 +1290,7 @@ static int user_event_parse(char *name, char *args, char *flags, return 0; } - index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); if (index == MAX_EVENTS) return -EMFILE; @@ -1073,6 +1304,7 @@ static int user_event_parse(char *name, char *args, char *flags, INIT_LIST_HEAD(&user->fields); INIT_LIST_HEAD(&user->validators); + user->group = group; user->tracepoint.name = name; ret = user_event_parse_fields(user, args); @@ -1091,8 +1323,8 @@ static int user_event_parse(char *name, char *args, char *flags, user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; + user->class.system = group->system_name; - user->class.system = USER_EVENTS_SYSTEM; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; user->class.reg = user_event_reg; @@ -1110,13 +1342,13 @@ static int user_event_parse(char *name, char *args, char *flags, user->index = index; - /* Ensure we track ref */ - atomic_inc(&user->refcnt); + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, page_bitmap); - hash_add(register_table, &user->node, key); + set_bit(user->index, group->page_bitmap); + hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1127,6 +1359,7 @@ put_user_lock: put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); + kfree(user->call.print_fmt); kfree(user); return ret; } @@ -1134,32 +1367,20 @@ put_user: /* * Deletes a previously created event if it is no longer being used. 
*/ -static int delete_user_event(char *name) +static int delete_user_event(struct user_event_group *group, char *name) { u32 key; - int ret; - struct user_event *user = find_user_event(name, &key); + struct user_event *user = find_user_event(group, name, &key); if (!user) return -ENOENT; - /* Ensure we are the last ref */ - if (atomic_read(&user->refcnt) != 1) { - ret = -EBUSY; - goto put_ref; - } - - ret = destroy_user_event(user); - - if (ret) - goto put_ref; + refcount_dec(&user->refcnt); - return ret; -put_ref: - /* No longer have this ref */ - atomic_dec(&user->refcnt); + if (!user_event_last_ref(user)) + return -EBUSY; - return ret; + return destroy_user_event(user); } /* @@ -1167,6 +1388,7 @@ put_ref: */ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) { + struct user_event_file_info *info = file->private_data; struct user_event_refs *refs; struct user_event *user = NULL; struct tracepoint *tp; @@ -1178,7 +1400,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) rcu_read_lock_sched(); - refs = rcu_dereference_sched(file->private_data); + refs = rcu_dereference_sched(info->refs); /* * The refs->events array is protected by RCU, and new items may be @@ -1236,6 +1458,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) return ret; } +static int user_events_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + struct user_event_file_info *info; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return -ENOMEM; + + info->group = group; + + file->private_data = info; + + return 0; +} + static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -1245,7 +1489,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, + count, &iov, &i))) return -EFAULT; return user_events_write_core(file, &i); @@ -1256,13 +1501,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) return user_events_write_core(kp->ki_filp, i); } -static int user_events_ref_add(struct file *file, struct user_event *user) +static int user_events_ref_add(struct user_event_file_info *info, + struct user_event *user) { + struct user_event_group *group = info->group; struct user_event_refs *refs, *new_refs; int i, size, count = 0; - refs = rcu_dereference_protected(file->private_data, - lockdep_is_held(®_mutex)); + refs = rcu_dereference_protected(info->refs, + lockdep_is_held(&group->reg_mutex)); if (refs) { count = refs->count; @@ -1286,9 +1533,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user) new_refs->events[i] = user; - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); - rcu_assign_pointer(file->private_data, new_refs); + rcu_assign_pointer(info->refs, new_refs); if (refs) kfree_rcu(refs, rcu); @@ -1309,13 +1556,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (size > PAGE_SIZE) return -E2BIG; - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + kreg->size = size; + + return 0; } /* * 
Registers a user_event on behalf of a user process. */ -static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +static long user_events_ioctl_reg(struct user_event_file_info *info, + unsigned long uarg) { struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; @@ -1336,24 +1594,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) return ret; } - ret = user_event_parse_cmd(name, &user); + ret = user_event_parse_cmd(info->group, name, &user); if (ret) { kfree(name); return ret; } - ret = user_events_ref_add(file, user); + ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); /* Positive number is index and valid */ if (ret < 0) return ret; put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_index); + put_user(user->index, &ureg->status_bit); return 0; } @@ -1361,7 +1619,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) /* * Deletes a user_event on behalf of a user process. */ -static long user_events_ioctl_del(struct file *file, unsigned long uarg) +static long user_events_ioctl_del(struct user_event_file_info *info, + unsigned long uarg) { void __user *ubuf = (void __user *)uarg; char *name; @@ -1374,7 +1633,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) /* event_mutex prevents dyn_event from racing */ mutex_lock(&event_mutex); - ret = delete_user_event(name); + ret = delete_user_event(info->group, name); mutex_unlock(&event_mutex); kfree(name); @@ -1388,19 +1647,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) static long user_events_ioctl(struct file *file, unsigned int cmd, unsigned long uarg) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group = info->group; long ret = -ENOTTY; switch (cmd) { case DIAG_IOCSREG: - mutex_lock(®_mutex); - ret = user_events_ioctl_reg(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_reg(info, uarg); + mutex_unlock(&group->reg_mutex); break; case DIAG_IOCSDEL: - mutex_lock(®_mutex); - ret = user_events_ioctl_del(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_del(info, uarg); + mutex_unlock(&group->reg_mutex); break; } @@ -1412,17 +1673,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, */ static int user_events_release(struct inode *node, struct file *file) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group; struct user_event_refs *refs; struct user_event *user; int i; + if (!info) + return -EINVAL; + + group = info->group; + /* * Ensure refs cannot change under any situation by taking the * register mutex during the final freeing of the references. 
*/ - mutex_lock(®_mutex); + mutex_lock(&group->reg_mutex); - refs = file->private_data; + refs = info->refs; if (!refs) goto out; @@ -1436,37 +1704,56 @@ static int user_events_release(struct inode *node, struct file *file) user = refs->events[i]; if (user) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); } out: file->private_data = NULL; - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); kfree(refs); + kfree(info); return 0; } static const struct file_operations user_data_fops = { + .open = user_events_open, .write = user_events_write, .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, .release = user_events_release, }; +static struct user_event_group *user_status_group(struct file *file) +{ + struct seq_file *m = file->private_data; + + if (!m) + return NULL; + + return m->private; +} + /* * Maps the shared page into the user process for checking if event is enabled. */ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) { + char *pages; + struct user_event_group *group = user_status_group(file); unsigned long size = vma->vm_end - vma->vm_start; - if (size != MAX_EVENTS) + if (size != MAX_BYTES) + return -EINVAL; + + if (!group) return -EINVAL; + pages = group->register_page_data; + return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(register_page_data) >> PAGE_SHIFT, + virt_to_phys(pages) >> PAGE_SHIFT, size, vm_get_page_prot(VM_READ)); } @@ -1490,14 +1777,18 @@ static void user_seq_stop(struct seq_file *m, void *p) static int user_seq_show(struct seq_file *m, void *p) { + struct user_event_group *group = m->private; struct user_event *user; char status; int i, active = 0, busy = 0, flags; - mutex_lock(®_mutex); + if (!group) + return -EINVAL; + + mutex_lock(&group->reg_mutex); - hash_for_each(register_table, i, user, node) { - status = register_page_data[user->index]; + hash_for_each(group->register_table, i, user, node) { + status = user->status; flags = user->flags; seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); @@ -1520,7 +1811,7 @@ static int user_seq_show(struct seq_file *m, void *p) active++; } - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); @@ -1539,7 +1830,24 @@ static const struct seq_operations user_seq_ops = { static int user_status_open(struct inode *node, struct file *file) { - return seq_open(file, &user_seq_ops); + struct user_event_group *group; + int ret; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + ret = seq_open(file, &user_seq_ops); + + if (!ret) { + /* Chain group to seq_file */ + struct seq_file *m = file->private_data; + + m->private = group; + } + + return ret; } static const struct file_operations user_status_fops = { @@ -1580,42 +1888,21 @@ err: return -ENODEV; } -static void set_page_reservations(bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = register_page_data + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static int __init trace_events_user_init(void) { - struct page *pages; int ret; - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(page_bitmap, MAX_EVENTS); - set_bit(0, page_bitmap); + init_group = user_event_group_create(&init_user_ns); - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - if (!pages) + if (!init_group) return -ENOMEM; - register_page_data = page_address(pages); - - set_page_reservations(true); 
ret = create_user_tracefs(); if (ret) { pr_warn("user_events could not register with tracefs\n"); - set_page_reservations(false); - __free_pages(pages, MAX_PAGE_ORDER); + user_event_group_destroy(init_group); + init_group = NULL; return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 23f7f0ec4f4c..ee77c8203bd5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -20,6 +20,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. 
- */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int @@ -1394,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = (unsigned long)tk->rp.kp.addr; store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); @@ -1435,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, return; fbuffer.regs = regs; - entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 313439920a8c..94c1b5eb1dc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -49,6 +49,28 @@ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ /* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_WORKLOAD, + OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; + +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + +/* * trace_array of the enabled osnoise/timerlat instances. */ struct osnoise_instance { @@ -917,7 +939,7 @@ void osnoise_trace_irq_entry(int id) void osnoise_trace_irq_exit(int id, const char *desc) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1048,7 +1070,7 @@ static void trace_softirq_entry_callback(void *data, unsigned int vec_nr) static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1144,7 +1166,7 @@ thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) static void thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) { - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1173,11 +1195,12 @@ trace_sched_switch_callback(void *data, bool preempt, unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); - if (p->pid != osn_var->pid) + if ((p->pid != osn_var->pid) || !workload) thread_exit(osn_var, p); - if (n->pid != osn_var->pid) + if ((n->pid != osn_var->pid) || !workload) thread_entry(osn_var, n); } @@ -1255,6 +1278,9 @@ static __always_inline void osnoise_stop_tracing(void) trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, "stop tracing hit on cpu %d\n", smp_processor_id()); + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + tracer_tracing_off(tr); } rcu_read_unlock(); @@ -1289,12 +1315,14 @@ static void notify_new_max_latency(u64 latency) */ static int run_osnoise(void) { + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); struct osnoise_variables *osn_var = this_cpu_osn_var(); u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; s64 total, 
last_total = 0; struct osnoise_sample s; + bool disable_preemption; unsigned int threshold; u64 runtime, stop_in; u64 sum_noise = 0; @@ -1302,6 +1330,12 @@ static int run_osnoise(void) int ret = -1; /* + * Disabling preemption is only required if IRQs are enabled, + * and the option is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + + /* * Considers the current thread as the workload. */ osn_var->pid = current->pid; @@ -1317,6 +1351,15 @@ static int run_osnoise(void) threshold = tracing_thresh ? : 5000; /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); + + /* * Make sure NMIs see sampling first */ osn_var->sampling = true; @@ -1403,16 +1446,21 @@ static int run_osnoise(void) * cond_resched() */ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); + rcu_momentary_dyntick_idle(); - local_irq_enable(); + + if (!disable_irq) + local_irq_enable(); } /* * For the non-preemptive kernel config: let threads runs, if - * they so wish. + * they so wish, unless set not to do so. */ - cond_resched(); + if (!disable_irq && !disable_preemption) + cond_resched(); last_sample = sample; last_int_count = int_count; @@ -1432,6 +1480,15 @@ static int run_osnoise(void) barrier(); /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + + /* * Save noise info. */ s.noise = time_to_us(sum_noise); @@ -1710,9 +1767,16 @@ static void stop_kthread(unsigned int cpu) struct task_struct *kthread; kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) + if (kthread) { kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } else { + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + return; + } + } } /* @@ -1746,6 +1810,13 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } + snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1786,8 +1857,9 @@ static int start_per_cpu_kthreads(void) for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); if (retval) { + cpus_read_unlock(); stop_per_cpu_kthreads(); - break; + return retval; } } @@ -1860,6 +1932,150 @@ static void osnoise_init_hotplug_support(void) #endif /* CONFIG_HOTPLUG_CPU */ /* + * seq file functions for the osnoise/options file.
+ */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. + */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. 
+ */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + +/* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into @@ -2041,6 +2257,14 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE static int init_timerlat_stack_tracefs(struct dentry *top_dir) @@ -2127,6 +2351,11 @@ static int init_tracefs(void) if (!tmp) goto err; + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + ret = init_timerlat_tracefs(top_dir); if (ret) goto err; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 67f47ea27921..57a13b61f186 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -11,6 +11,7 @@ #include <linux/kprobes.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> +#include <linux/idr.h> #include "trace_output.h" @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE; - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -323,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) } EXPORT_SYMBOL(trace_event_printf); -static int trace_output_raw(struct trace_iterator *iter, char *name, - char *fmt, va_list ap) +static __printf(3, 0) +int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; @@ -688,38 +688,23 @@ struct trace_event *ftrace_find_event(int type) return NULL; } -static LIST_HEAD(ftrace_event_list); +static DEFINE_IDA(trace_event_ida); -static int trace_search_list(struct list_head **list) +static void free_trace_event_type(int type) { - struct trace_event *e = NULL, *iter; - int next = __TRACE_LAST_TYPE; - - if (list_empty(&ftrace_event_list)) { - *list = &ftrace_event_list; - return next; - } + if (type >= __TRACE_LAST_TYPE) + ida_free(&trace_event_ida, type); +} - /* - * We used up all possible max events, - * lets see if somebody freed one. - */ - list_for_each_entry(iter, &ftrace_event_list, list) { - if (iter->type != next) { - e = iter; - break; - } - next++; - } +static int alloc_trace_event_type(void) +{ + int next; - /* Did we used up all 65 thousand events??? 
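/*
 * Editor's note: the block below is an illustrative sketch added to this
 * review copy; it is not part of the patch. It shows how user space might
 * drive the new osnoise/options file created in trace_osnoise.c above:
 * writing an option name sets the corresponding bit, writing it with a
 * "NO_" prefix clears it, and "DEFAULTS" restores OSN_DEFAULT_OPTIONS.
 * The tracefs path is an assumption (tracefs is commonly mounted at
 * /sys/kernel/tracing, sometimes under /sys/kernel/debug/tracing).
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int osnoise_write_option(const char *token)
{
	int fd = open("/sys/kernel/tracing/osnoise/options", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;

	/* e.g. "OSNOISE_IRQ_DISABLE" to set, "NO_OSNOISE_IRQ_DISABLE" to clear. */
	ret = write(fd, token, strlen(token));
	close(fd);

	return ret < 0 ? -1 : 0;
}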
*/ - if (next > TRACE_EVENT_TYPE_MAX) + /* Skip static defined type numbers */ + next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE, + TRACE_EVENT_TYPE_MAX, GFP_KERNEL); + if (next < 0) return 0; - - if (e) - *list = &e->list; - else - *list = &ftrace_event_list; return next; } @@ -761,28 +746,10 @@ int register_trace_event(struct trace_event *event) if (WARN_ON(!event->funcs)) goto out; - INIT_LIST_HEAD(&event->list); - if (!event->type) { - struct list_head *list = NULL; - - if (next_event_type > TRACE_EVENT_TYPE_MAX) { - - event->type = trace_search_list(&list); - if (!event->type) - goto out; - - } else { - - event->type = next_event_type++; - list = &ftrace_event_list; - } - - if (WARN_ON(ftrace_find_event(event->type))) + event->type = alloc_trace_event_type(); + if (!event->type) goto out; - - list_add_tail(&event->list, list); - } else if (WARN(event->type > __TRACE_LAST_TYPE, "Need to add type to trace.h")) { goto out; @@ -819,7 +786,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); int __unregister_trace_event(struct trace_event *event) { hlist_del(&event->node); - list_del(&event->list); + free_trace_event_type(event->type); return 0; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 36dff277de46..01ebabbbe8c9 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -76,9 +76,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; /* Fetch type information table */ static const struct fetch_type probe_fetch_types[] = { /* Special types */ - __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1, "__data_loc char[]"), - __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, + __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1, + "__data_loc char[]"), + __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1, "__data_loc char[]"), /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), @@ -98,10 +100,15 @@ static const struct fetch_type probe_fetch_types[] = { ASSIGN_FETCH_TYPE_END }; -static const struct fetch_type *find_fetch_type(const char *type) +static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags) { int i; + /* Reject the symbol/symstr for uprobes */ + if (type && (flags & TPARG_FL_USER) && + (!strcmp(type, "symbol") || !strcmp(type, "symstr"))) + return NULL; + if (!type) type = DEFAULT_FETCH_TYPE_STR; @@ -119,13 +126,13 @@ static const struct fetch_type *find_fetch_type(const char *type) switch (bs) { case 8: - return find_fetch_type("u8"); + return find_fetch_type("u8", flags); case 16: - return find_fetch_type("u16"); + return find_fetch_type("u16", flags); case 32: - return find_fetch_type("u32"); + return find_fetch_type("u32", flags); case 64: - return find_fetch_type("u64"); + return find_fetch_type("u64", flags); default: goto fail; } @@ -246,7 +253,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -EINVAL; } strlcpy(buf, event, slash - event + 1); - if (!is_good_name(buf)) { + if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } @@ -478,7 +485,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL); + const struct fetch_type *t2 = find_fetch_type(NULL, flags); *tmp = '\0'; ret = parse_probe_arg(arg, t2, &code, end, flags, offs); @@ -630,9 +637,9 @@ static int traceprobe_parse_probe_arg_body(const char 
*argv, ssize_t *size, /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string"); + parg->type = find_fetch_type("string", flags); } else - parg->type = find_fetch_type(t); + parg->type = find_fetch_type(t, flags); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } @@ -662,16 +669,26 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; /* Store operation */ - if (!strcmp(parg->type->name, "string") || - !strcmp(parg->type->name, "ustring")) { - if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && - code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), - BAD_STRING); - goto fail; + if (parg->type->is_string) { + if (!strcmp(parg->type->name, "symstr")) { + if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && + code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && + code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_SYMSTRING); + goto fail; + } + } else { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); + goto fail; + } } - if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || + if (!strcmp(parg->type->name, "symstr") || + (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || parg->count) { /* @@ -679,6 +696,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * must be kept, and if parg->count != 0, this is an * array of string pointers instead of string address * itself. + * For the symstr, it doesn't need to dereference, thus + * it just gets the value.
*/ code++; if (code->op != FETCH_OP_NOP) { @@ -690,6 +709,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (!strcmp(parg->type->name, "ustring") || code->op == FETCH_OP_UDEREF) code->op = FETCH_OP_ST_USTRING; + else if (!strcmp(parg->type->name, "symstr")) + code->op = FETCH_OP_ST_SYMSTR; else code->op = FETCH_OP_ST_STRING; code->size = parg->type->size; @@ -919,8 +940,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; if (parg->count) { - if ((strcmp(parg->type->name, "string") == 0) || - (strcmp(parg->type->name, "ustring") == 0)) + if (parg->type->is_string) fmt = ", __get_str(%s[%d])"; else fmt = ", REC->%s[%d]"; @@ -928,8 +948,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, pos += snprintf(buf + pos, LEN_OR_ZERO, fmt, parg->name, j); } else { - if ((strcmp(parg->type->name, "string") == 0) || - (strcmp(parg->type->name, "ustring") == 0)) + if (parg->type->is_string) fmt = ", __get_str(%s)"; else fmt = ", REC->%s"; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 3b3869ae8cfd..23acfd1c3812 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -98,6 +98,7 @@ enum fetch_op { FETCH_OP_ST_UMEM, /* Mem: .offset, .size */ FETCH_OP_ST_STRING, /* String: .offset, .size */ FETCH_OP_ST_USTRING, /* User String: .offset, .size */ + FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op @@ -133,7 +134,8 @@ struct fetch_insn { struct fetch_type { const char *name; /* Name of type */ size_t size; /* Byte size of type */ - int is_signed; /* Signed flag */ + bool is_signed; /* Signed flag */ + bool is_string; /* String flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ @@ -177,16 +179,19 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); #define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) #define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \ + {.name = _name, \ .size = _size, \ - .is_signed = sign, \ + .is_signed = (bool)sign, \ + .is_string = (bool)str, \ .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ } + +/* Non string types can use these macros */ #define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) @@ -353,7 +358,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) #define TPARG_FL_TPOINT BIT(3) -#define TPARG_FL_MASK GENMASK(3, 0) +#define TPARG_FL_USER BIT(4) +#define TPARG_FL_MASK GENMASK(4, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -431,6 +437,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(ARRAY_TOO_BIG, "Array number is too big"), \ C(BAD_TYPE, "Unknown type is specified"), \ C(BAD_STRING, "String accepts only memory 
argument"), \ + C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \ C(BAD_BITFIELD, "Invalid bitfield"), \ C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ C(NO_ARG_NAME, "Argument name is not specified"), \ @@ -445,7 +452,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(SAME_PROBE, "There is already the exact same probe event"),\ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ - C(BAD_ATTACH_ARG, "Attached event does not have this field"), + C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ + C(NO_EP_FILTER, "No filter rule after 'if'"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h new file mode 100644 index 000000000000..77dbd9ff9782 --- /dev/null +++ b/kernel/trace/trace_probe_kernel.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_PROBE_KERNEL_H_ +#define __TRACE_PROBE_KERNEL_H_ + +#define FAULT_STRING "(fault)" + +/* + * This depends on trace_probe.h, but can not include it due to + * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. + * Which means that any other user must include trace_probe.h before including + * this file. + */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + int ret; + + ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + /* + * strnlen_user_nofault returns zero on fault, insert the + * FAULT_STRING when that occurs. + */ + if (ret <= 0) + return strlen(FAULT_STRING) + 1; + return ret; +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return kern_fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + /* For faults, return enough to hold the FAULT_STRING */ + return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; +} + +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) +{ + if (ret >= 0) { + *(u32 *)dest = make_data_loc(ret, __dest - base); + } else { + strscpy(__dest, FAULT_STRING, len); + ret = strlen(__dest) + 1; + } +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. 
+ */ +static nokprobe_inline int +kern_fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return kern_fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +#endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index b3bdb8ddb862..5cea672243f6 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -67,6 +67,37 @@ probe_mem_read(void *dest, void *src, size_t size); static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size); +static nokprobe_inline int +fetch_store_symstrlen(unsigned long addr) +{ + char namebuf[KSYM_SYMBOL_LEN]; + int ret; + + ret = sprint_symbol(namebuf, addr); + if (ret < 0) + return 0; + + return ret + 1; +} + +/* + * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_symstring(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + return sprint_symbol(__dest, addr); +} + /* From the 2nd stage, routine is same */ static nokprobe_inline int process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, @@ -99,16 +130,22 @@ stage2: stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { - if (code->op == FETCH_OP_ST_STRING) { + switch (code->op) { + case FETCH_OP_ST_STRING: ret = fetch_store_strlen(val + code->offset); code++; goto array; - } else if (code->op == FETCH_OP_ST_USTRING) { + case FETCH_OP_ST_USTRING: ret += fetch_store_strlen_user(val + code->offset); code++; goto array; - } else + case FETCH_OP_ST_SYMSTR: + ret += fetch_store_symstrlen(val + code->offset); + code++; + goto array; + default: return -EILSEQ; + } } switch (code->op) { @@ -129,6 +166,10 @@ stage3: loc = *(u32 *)dest; ret = fetch_store_string_user(val + code->offset, dest, base); break; + case FETCH_OP_ST_SYMSTR: + loc = *(u32 *)dest; + ret = fetch_store_symstring(val + code->offset, dest, base); + break; default: return -EILSEQ; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2d301f58ced..ff0536cea968 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -noinline __noclone static void trace_direct_tramp(void) { } +#ifndef CALL_DEPTH_ACCOUNT +#define CALL_DEPTH_ACCOUNT "" +#endif + +noinline __noclone static void trace_direct_tramp(void) +{ + asm(CALL_DEPTH_ACCOUNT); +} #endif /* diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b69e207012c9..942ddbdace4a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -201,8 +201,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return trace_handle_return(s); } -extern char *__bad_type_size(void); - #define SYSCALL_FIELD(_type, _name) 
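/*
 * Editor's note: illustrative sketch for this review copy, not part of the
 * patch. The kern_fetch_store_string*() helpers and set_data_loc() above
 * store dynamic strings using the ftrace __data_loc convention: a u32 whose
 * upper 16 bits carry the string length and whose lower 16 bits carry the
 * offset of the data from the event entry. The EX_* macros below restate
 * that packing locally for illustration (the kernel's make_data_loc(),
 * get_loc_len() and get_loc_data() live in trace_probe.h).
 */
#include <stdint.h>
#include <stdio.h>

#define EX_MAKE_DATA_LOC(len, offs)	(((uint32_t)(len) << 16) | ((offs) & 0xffff))
#define EX_GET_LOC_LEN(dl)		((uint32_t)(dl) >> 16)
#define EX_GET_LOC_OFFS(dl)		((uint32_t)(dl) & 0xffff)

int main(void)
{
	/* Suppose 8 bytes ("(fault)" plus NUL) were stored 40 bytes into the entry. */
	uint32_t dl = EX_MAKE_DATA_LOC(8, 40);

	printf("len=%u offs=%u\n",
	       (unsigned int)EX_GET_LOC_LEN(dl),
	       (unsigned int)EX_GET_LOC_OFFS(dl));
	return 0;
}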
{ \ .type = #_type, .name = #_name, \ .size = sizeof(_type), .align = __alignof__(_type), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fb58e86dd117..8d64b6553aed 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -691,7 +691,8 @@ static int __trace_uprobe_create(int argc, const char **argv) for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - is_return ? TPARG_FL_RETURN : 0); + (is_return ? TPARG_FL_RETURN : 0) | + TPARG_FL_USER); if (ret) goto error; } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9901708ce6b8..c774e560f2f9 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { - unsigned int dups = 0, total_dups = 0; + unsigned int total_dups = 0; int i; void *key; @@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries, key = sort_entries[0]->key; for (i = 1; i < n_entries; i++) { if (!memcmp(sort_entries[i]->key, key, key_size)) { - dups++; total_dups++; + total_dups++; continue; } key = sort_entries[i]->key; - dups = 0; } WARN_ONCE(total_dups > 0, diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ef42c1a11920..f23144af5743 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; - int ret = 0; if (!mod->num_tracepoints) return 0; @@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod) */ if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoint_module_list_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); - if (!tp_mod) { - ret = -ENOMEM; - goto end; - } + if (!tp_mod) + return -ENOMEM; tp_mod->mod = mod; + + mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); -end: mutex_unlock(&tracepoint_module_list_mutex); - return ret; + return 0; } static void tracepoint_module_going(struct module *mod) diff --git a/kernel/ucount.c b/kernel/ucount.c index 06ea04d44685..ee8e57fd6f90 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -87,10 +87,6 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif - { }, - { }, - { }, - { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -263,29 +259,29 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } -long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long new = atomic_long_add_return(v, &iter->ucount[type]); + long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); } return ret; } -bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +bool 
dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long dec = atomic_long_sub_return(v, &iter->ucount[type]); + long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; @@ -294,11 +290,11 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, - struct ucounts *last, enum ucount_type type) + struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { - long dec = atomic_long_sub_return(1, &iter->ucount[type]); + long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) @@ -306,12 +302,12 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, } } -void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type) +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -319,12 +315,12 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long new = atomic_long_add_return(1, &iter->ucount[type]); + long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. 
@@ -336,24 +332,24 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) } return ret; dec_unwind: - dec = atomic_long_sub_return(1, &iter->ucount[type]); + dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit) +bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long val = get_ucounts_value(iter, type); + long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); } return false; } diff --git a/kernel/umh.c b/kernel/umh.c index b989736e8707..850631518665 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -28,6 +28,7 @@ #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> +#include <linux/freezer.h> #include <trace/events/module.h> @@ -403,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { + unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -436,18 +438,22 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; + if (wait & UMH_KILLABLE) + state |= TASK_KILLABLE; + + if (wait & UMH_FREEZABLE) + state |= TASK_FREEZABLE; + retval = wait_for_completion_state(&done, state); + if (!retval) + goto wait_done; + + if (wait & UMH_KILLABLE) { /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; - /* fallthrough, umh_complete() was already called */ } - wait_for_completion(&done); wait_done: retval = sub_info->retval; out: diff --git a/kernel/user.c b/kernel/user.c index e2cf8c22b539..d667debeafd6 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,6 +185,7 @@ void free_uid(struct user_struct *up) if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } +EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5481ba44a8d6..54211dbd516c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> +#include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> @@ -113,6 +114,10 @@ int create_user_ns(struct cred *new) !kgid_has_mapping(parent_ns, group)) goto fail_dec; + ret = security_create_user_ns(new); + if (ret < 0) + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) @@ -131,13 +136,13 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { + for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); - set_rlimit_ucount_max(ns, 
UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ca61d49885b..f50398cb790d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <linux/random.h> #include <linux/sysctl.h> #include <linux/wait.h> #include <linux/rwsem.h> @@ -57,6 +58,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write, * theoretically be incorrect if there are two parallel writes * at non-zero offsets to the same sysctl. */ + add_device_randomness(tmp_data, sizeof(tmp_data)); down_write(&uts_sem); memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); up_write(&uts_sem); @@ -72,8 +74,16 @@ static int proc_do_uts_string(struct ctl_table *table, int write, static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); +// Note: update 'enum uts_proc' to match any changes to this table static struct ctl_table uts_kern_table[] = { { + .procname = "arch", + .data = init_uts_ns.name.machine, + .maxlen = sizeof(init_uts_ns.name.machine), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, + { .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeea9731ef80..07895deca271 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -1771,7 +1771,7 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { rwork->wq = wq; - call_rcu(&rwork->rcu, rcu_work_rcufn); + call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); return true; } @@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (!from_cancel) { - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - } + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); |
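/*
 * Editor's note: illustrative sketch for this review copy, not part of the
 * patch. The workqueue.c hunk above switches queue_rcu_work() from
 * call_rcu() to call_rcu_hurry(), so an rcu_work callback is no longer
 * subject to lazy RCU batching delays. A typical caller looks like the
 * hypothetical module below (the demo_* names are invented for illustration).
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_rcu_work_fn(struct work_struct *work)
{
	/* Runs in process context after a full RCU grace period has elapsed. */
	pr_info("rcu_work callback ran\n");
}

static struct rcu_work demo_rwork;

static int __init demo_init(void)
{
	INIT_RCU_WORK(&demo_rwork, demo_rcu_work_fn);
	/* Returns false if the work was already pending from an earlier call. */
	queue_rcu_work(system_wq, &demo_rwork);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Wait for the grace period and the queued work before unloading. */
	flush_rcu_work(&demo_rwork);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");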