diff options
Diffstat (limited to 'kernel')
104 files changed, 4787 insertions, 3601 deletions
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index be8c680121e4..d6ef4f4f9cba 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -529,7 +529,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); + (void **)&f->lsm_rule, + GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { @@ -799,7 +800,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); + (void **)&df->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7eb9ad3a3ae6..0291eef9ce92 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -50,5 +50,11 @@ endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o -$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE +obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o + +# Some source files are common to libbpf. +vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf + +$(obj)/%.o: %.c FORCE $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 583ee4fe48ef..e52b3ad231b9 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -212,6 +212,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map) struct vma_list { struct vm_area_struct *vma; struct list_head head; + atomic_t mmap_count; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -221,20 +222,30 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) vml = kmalloc(sizeof(*vml), GFP_KERNEL); if (!vml) return -ENOMEM; + atomic_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; list_add(&vml->head, &arena->vma_list); return 0; } +static void arena_vm_open(struct vm_area_struct *vma) +{ + struct vma_list *vml = vma->vm_private_data; + + atomic_inc(&vml->mmap_count); +} + static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - struct vma_list *vml; + struct vma_list *vml = vma->vm_private_data; + if (!atomic_dec_and_test(&vml->mmap_count)) + return; guard(mutex)(&arena->lock); - vml = vma->vm_private_data; + /* update link list under lock */ list_del(&vml->head); vma->vm_private_data = NULL; kfree(vml); @@ -287,6 +298,7 @@ out: } static const struct vm_operations_struct arena_vm_ops = { + .open = arena_vm_open, .close = arena_vm_close, .fault = arena_vm_fault, }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 68240c3c6e7d..08a338e1f231 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -280,6 +280,7 @@ BTF_ID(func, bpf_lsm_cred_prepare) BTF_ID(func, bpf_lsm_file_ioctl) BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_post_open) BTF_ID(func, bpf_lsm_file_receive) BTF_ID(func, bpf_lsm_inode_create) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 86c7884abaf8..0d515ec57aa5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -12,6 +12,7 @@ #include <linux/mutex.h> #include <linux/btf_ids.h> #include <linux/rcupdate_wait.h> +#include <linux/poll.h> struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -56,6 +57,7 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; + wait_queue_head_t wait_hup; }; static DEFINE_MUTEX(update_mutex); @@ -571,7 +573,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, } size = arch_prepare_bpf_trampoline(NULL, image + image_off, - image + PAGE_SIZE, + image + image_off + size, model, flags, tlinks, stub_func); if (size <= 0) { if (image != *_image) @@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - err = st_ops->reg(kdata); + err = st_ops->reg(kdata, NULL); if (likely(!err)) { /* This refcnt increment on the map here after * 'st_ops->reg()' is secure since the state of the @@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: - st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL); bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: @@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) st_map = (struct bpf_struct_ops_map *) rcu_dereference_protected(st_link->map, true); if (st_map) { - /* st_link->map can be NULL if - * bpf_struct_ops_link_create() fails to register. - */ - st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); bpf_map_put(&st_map->map); } kfree(st_link); @@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); - seq_printf(seq, "map_id:\t%d\n", map->id); + if (map) + seq_printf(seq, "map_id:\t%d\n", map->id); rcu_read_unlock(); } @@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); - info->struct_ops.map_id = map->id; + if (map) + info->struct_ops.map_id = map->id; rcu_read_unlock(); return 0; } @@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (!old_map) { + err = -ENOLINK; + goto err_out; + } if (expected_old_map && old_map != expected_old_map) { err = -EPERM; goto err_out; @@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map goto err_out; } - err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); + err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); if (err) goto err_out; @@ -1139,11 +1144,53 @@ err_out: return err; } +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; + + mutex_lock(&update_mutex); + + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (!map) { + mutex_unlock(&update_mutex); + return 0; + } + st_map = container_of(map, struct bpf_struct_ops_map, map); + + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); + + RCU_INIT_POINTER(st_link->map, NULL); + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or + * bpf_map_inc() in bpf_struct_ops_map_link_update(). + */ + bpf_map_put(&st_map->map); + + mutex_unlock(&update_mutex); + + wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); + + return 0; +} + +static __poll_t bpf_struct_ops_map_link_poll(struct file *file, + struct poll_table_struct *pts) +{ + struct bpf_struct_ops_link *st_link = file->private_data; + + poll_wait(file, &st_link->wait_hup, pts); + + return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP; +} + static const struct bpf_link_ops bpf_struct_ops_map_lops = { .dealloc = bpf_struct_ops_map_link_dealloc, + .detach = bpf_struct_ops_map_link_detach, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, .update_map = bpf_struct_ops_map_link_update, + .poll = bpf_struct_ops_map_link_poll, }; int bpf_struct_ops_link_create(union bpf_attr *attr) @@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) if (err) goto err_out; - err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); + init_waitqueue_head(&link->wait_hup); + + /* Hold the update_mutex such that the subsystem cannot + * do link->ops->detach() before the link is fully initialized. + */ + mutex_lock(&update_mutex); + err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) { + mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); link = NULL; goto err_out; } RCU_INIT_POINTER(link->map, map); + mutex_unlock(&update_mutex); return bpf_link_settle(&link_primer); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 821063660d9f..520f49f422fe 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -274,6 +274,7 @@ struct btf { u32 start_str_off; /* first string offset (0 for base BTF) */ char name[MODULE_NAME_LEN]; bool kernel_btf; + __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */ }; enum verifier_phase { @@ -414,7 +415,7 @@ const char *btf_type_str(const struct btf_type *t) struct btf_show { u64 flags; void *target; /* target of show operation (seq file, buffer) */ - void (*showfn)(struct btf_show *show, const char *fmt, va_list args); + __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args); const struct btf *btf; /* below are used during iteration */ struct { @@ -530,6 +531,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t) btf_type_is_var(t) || btf_type_is_typedef(t); } +bool btf_is_vmlinux(const struct btf *btf) +{ + return btf->kernel_btf && !btf->base_btf; +} + u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -772,7 +778,7 @@ static bool __btf_name_char_ok(char c, bool first) return true; } -static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +const char *btf_str_by_offset(const struct btf *btf, u32 offset) { while (offset < btf->start_str_off) btf = btf->base_btf; @@ -1670,14 +1676,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf) if (!tab) return; - /* For module BTF, we directly assign the sets being registered, so - * there is nothing to free except kfunc_set_tab. - */ - if (btf_is_module(btf)) - goto free_tab; for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) kfree(tab->sets[hook]); -free_tab: kfree(tab); btf->kfunc_set_tab = NULL; } @@ -1735,7 +1735,12 @@ static void btf_free(struct btf *btf) kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); - kvfree(btf->data); + /* vmlinux does not allocate btf->data, it simply points it at + * __start_BTF. + */ + if (!btf_is_vmlinux(btf)) + kvfree(btf->data); + kvfree(btf->base_id_map); kfree(btf); } @@ -1764,6 +1769,23 @@ void btf_put(struct btf *btf) } } +struct btf *btf_base_btf(const struct btf *btf) +{ + return btf->base_btf; +} + +const struct btf_header *btf_header(const struct btf *btf) +{ + return &btf->hdr; +} + +void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) +{ + btf->base_btf = (struct btf *)base_btf; + btf->start_id = btf_nr_types(base_btf); + btf->start_str_off = base_btf->hdr.str_len; +} + static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -3442,10 +3464,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, goto end; \ } -static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, +static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, + u32 field_mask, u32 *seen_mask, int *align, int *sz) { int type = 0; + const char *name = __btf_name_by_offset(btf, var_type->name_off); if (field_mask & BPF_SPIN_LOCK) { if (!strcmp(name, "bpf_spin_lock")) { @@ -3481,7 +3505,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ - if (field_mask & BPF_KPTR) { + if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } @@ -3494,140 +3518,232 @@ end: #undef field_mask_test_name +/* Repeat a number of fields for a specified number of times. + * + * Copy the fields starting from the first field and repeat them for + * repeat_cnt times. The fields are repeated by adding the offset of each + * field with + * (i + 1) * elem_size + * where i is the repeat index and elem_size is the size of an element. + */ +static int btf_repeat_fields(struct btf_field_info *info, + u32 field_cnt, u32 repeat_cnt, u32 elem_size) +{ + u32 i, j; + u32 cur; + + /* Ensure not repeating fields that should not be repeated. */ + for (i = 0; i < field_cnt; i++) { + switch (info[i].type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: + case BPF_LIST_HEAD: + case BPF_RB_ROOT: + break; + default: + return -EINVAL; + } + } + + cur = field_cnt; + for (i = 0; i < repeat_cnt; i++) { + memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); + for (j = 0; j < field_cnt; j++) + info[cur++].off += (i + 1) * elem_size; + } + + return 0; +} + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, - struct btf_field_info *info, int info_cnt) + struct btf_field_info *info, int info_cnt, + u32 level); + +/* Find special fields in the struct type of a field. + * + * This function is used to find fields of special types that is not a + * global variable or a direct field of a struct type. It also handles the + * repetition if it is the element type of an array. + */ +static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t, + u32 off, u32 nelems, + u32 field_mask, struct btf_field_info *info, + int info_cnt, u32 level) { - int ret, idx = 0, align, sz, field_type; - const struct btf_member *member; + int ret, err, i; + + level++; + if (level >= MAX_RESOLVE_DEPTH) + return -E2BIG; + + ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level); + + if (ret <= 0) + return ret; + + /* Shift the offsets of the nested struct fields to the offsets + * related to the container. + */ + for (i = 0; i < ret; i++) + info[i].off += off; + + if (nelems > 1) { + err = btf_repeat_fields(info, ret, nelems - 1, t->size); + if (err == 0) + ret *= nelems; + else + ret = err; + } + + return ret; +} + +static int btf_find_field_one(const struct btf *btf, + const struct btf_type *var, + const struct btf_type *var_type, + int var_idx, + u32 off, u32 expected_size, + u32 field_mask, u32 *seen_mask, + struct btf_field_info *info, int info_cnt, + u32 level) +{ + int ret, align, sz, field_type; struct btf_field_info tmp; + const struct btf_array *array; + u32 i, nelems = 1; + + /* Walk into array types to find the element type and the number of + * elements in the (flattened) array. + */ + for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) { + array = btf_array(var_type); + nelems *= array->nelems; + var_type = btf_type_by_id(btf, array->type); + } + if (i == MAX_RESOLVE_DEPTH) + return -E2BIG; + if (nelems == 0) + return 0; + + field_type = btf_get_field_type(btf, var_type, + field_mask, seen_mask, &align, &sz); + /* Look into variables of struct types */ + if (!field_type && __btf_type_is_struct(var_type)) { + sz = var_type->size; + if (expected_size && expected_size != sz * nelems) + return 0; + ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask, + &info[0], info_cnt, level); + return ret; + } + + if (field_type == 0) + return 0; + if (field_type < 0) + return field_type; + + if (expected_size && expected_size != sz * nelems) + return 0; + if (off % align) + return 0; + + switch (field_type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_WORKQUEUE: + case BPF_LIST_NODE: + case BPF_RB_NODE: + case BPF_REFCOUNT: + ret = btf_find_struct(btf, var_type, off, sz, field_type, + info_cnt ? &info[0] : &tmp); + if (ret < 0) + return ret; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: + ret = btf_find_kptr(btf, var_type, off, sz, + info_cnt ? &info[0] : &tmp); + if (ret < 0) + return ret; + break; + case BPF_LIST_HEAD: + case BPF_RB_ROOT: + ret = btf_find_graph_root(btf, var, var_type, + var_idx, off, sz, + info_cnt ? &info[0] : &tmp, + field_type); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + return 0; + if (nelems > info_cnt) + return -E2BIG; + if (nelems > 1) { + ret = btf_repeat_fields(info, 1, nelems - 1, sz); + if (ret < 0) + return ret; + } + return nelems; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, + struct btf_field_info *info, int info_cnt, + u32 level) +{ + int ret, idx = 0; + const struct btf_member *member; u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), - field_mask, &seen_mask, &align, &sz); - if (field_type == 0) - continue; - if (field_type < 0) - return field_type; - off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; - if (off % align) - continue; - switch (field_type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - case BPF_WORKQUEUE: - case BPF_LIST_NODE: - case BPF_RB_NODE: - case BPF_REFCOUNT: - ret = btf_find_struct(btf, member_type, off, sz, field_type, - idx < info_cnt ? &info[idx] : &tmp); - if (ret < 0) - return ret; - break; - case BPF_KPTR_UNREF: - case BPF_KPTR_REF: - case BPF_KPTR_PERCPU: - ret = btf_find_kptr(btf, member_type, off, sz, - idx < info_cnt ? &info[idx] : &tmp); - if (ret < 0) - return ret; - break; - case BPF_LIST_HEAD: - case BPF_RB_ROOT: - ret = btf_find_graph_root(btf, t, member_type, - i, off, sz, - idx < info_cnt ? &info[idx] : &tmp, - field_type); - if (ret < 0) - return ret; - break; - default: - return -EFAULT; - } - - if (ret == BTF_FIELD_IGNORE) - continue; - if (idx >= info_cnt) - return -E2BIG; - ++idx; + ret = btf_find_field_one(btf, t, member_type, i, + off, 0, + field_mask, &seen_mask, + &info[idx], info_cnt - idx, level); + if (ret < 0) + return ret; + idx += ret; } return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, - int info_cnt) + int info_cnt, u32 level) { - int ret, idx = 0, align, sz, field_type; + int ret, idx = 0; const struct btf_var_secinfo *vsi; - struct btf_field_info tmp; u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), - field_mask, &seen_mask, &align, &sz); - if (field_type == 0) - continue; - if (field_type < 0) - return field_type; - off = vsi->offset; - if (vsi->size != sz) - continue; - if (off % align) - continue; - - switch (field_type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - case BPF_WORKQUEUE: - case BPF_LIST_NODE: - case BPF_RB_NODE: - case BPF_REFCOUNT: - ret = btf_find_struct(btf, var_type, off, sz, field_type, - idx < info_cnt ? &info[idx] : &tmp); - if (ret < 0) - return ret; - break; - case BPF_KPTR_UNREF: - case BPF_KPTR_REF: - case BPF_KPTR_PERCPU: - ret = btf_find_kptr(btf, var_type, off, sz, - idx < info_cnt ? &info[idx] : &tmp); - if (ret < 0) - return ret; - break; - case BPF_LIST_HEAD: - case BPF_RB_ROOT: - ret = btf_find_graph_root(btf, var, var_type, - -1, off, sz, - idx < info_cnt ? &info[idx] : &tmp, - field_type); - if (ret < 0) - return ret; - break; - default: - return -EFAULT; - } - - if (ret == BTF_FIELD_IGNORE) - continue; - if (idx >= info_cnt) - return -E2BIG; - ++idx; + ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size, + field_mask, &seen_mask, + &info[idx], info_cnt - idx, + level); + if (ret < 0) + return ret; + idx += ret; } return idx; } @@ -3637,9 +3753,9 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, int info_cnt) { if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, field_mask, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0); return -EINVAL; } @@ -5726,6 +5842,15 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) return ctx_type->type; } +bool btf_is_projection_of(const char *pname, const char *tname) +{ + if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) + return true; + if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) + return true; + return false; +} + bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) @@ -5788,9 +5913,7 @@ again: * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ - if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) - return true; - if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) + if (btf_is_projection_of(ctx_tname, tname)) return true; if (strcmp(ctx_tname, tname)) { /* bpf_user_pt_regs_t is a typedef, so resolve it to @@ -5982,23 +6105,15 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty BTF_ID_LIST(bpf_ctx_convert_btf_id) BTF_ID(struct, bpf_ctx_convert) -struct btf *btf_parse_vmlinux(void) +static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name, + void *data, unsigned int data_size) { - struct btf_verifier_env *env = NULL; - struct bpf_verifier_log *log; struct btf *btf = NULL; int err; if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) return ERR_PTR(-ENOENT); - env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); - if (!env) - return ERR_PTR(-ENOMEM); - - log = &env->log; - log->level = BPF_LOG_KERNEL; - btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; @@ -6006,10 +6121,10 @@ struct btf *btf_parse_vmlinux(void) } env->btf = btf; - btf->data = __start_BTF; - btf->data_size = __stop_BTF - __start_BTF; + btf->data = data; + btf->data_size = data_size; btf->kernel_btf = true; - snprintf(btf->name, sizeof(btf->name), "vmlinux"); + snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); if (err) @@ -6029,20 +6144,11 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; - /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - refcount_set(&btf->refcnt, 1); - err = btf_alloc_id(btf); - if (err) - goto errout; - - btf_verifier_env_free(env); return btf; errout: - btf_verifier_env_free(env); if (btf) { kvfree(btf->types); kfree(btf); @@ -6050,19 +6156,61 @@ errout: return ERR_PTR(err); } +struct btf *btf_parse_vmlinux(void) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf; + int err; + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF); + if (IS_ERR(btf)) + goto err_out; + + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + btf = ERR_PTR(err); + } +err_out: + btf_verifier_env_free(env); + return btf; +} + +/* If .BTF_ids section was created with distilled base BTF, both base and + * split BTF ids will need to be mapped to actual base/split ids for + * BTF now that it has been relocated. + */ +static __u32 btf_relocate_id(const struct btf *btf, __u32 id) +{ + if (!btf->base_btf || !btf->base_id_map) + return id; + return btf->base_id_map[id]; +} + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES -static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +static struct btf *btf_parse_module(const char *module_name, const void *data, + unsigned int data_size, void *base_data, + unsigned int base_data_size) { + struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; - struct btf *btf = NULL, *base_btf; - int err; + int err = 0; - base_btf = bpf_get_btf_vmlinux(); - if (IS_ERR(base_btf)) - return base_btf; - if (!base_btf) + vmlinux_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(vmlinux_btf)) + return vmlinux_btf; + if (!vmlinux_btf) return ERR_PTR(-EINVAL); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); @@ -6072,6 +6220,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u log = &env->log; log->level = BPF_LOG_KERNEL; + if (base_data) { + base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size); + if (IS_ERR(base_btf)) { + err = PTR_ERR(base_btf); + goto errout; + } + } else { + base_btf = vmlinux_btf; + } + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { err = -ENOMEM; @@ -6111,12 +6269,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u if (err) goto errout; + if (base_btf != vmlinux_btf) { + err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map); + if (err) + goto errout; + btf_free(base_btf); + base_btf = vmlinux_btf; + } + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; errout: btf_verifier_env_free(env); + if (base_btf != vmlinux_btf) + btf_free(base_btf); if (btf) { kvfree(btf->data); kvfree(btf->types); @@ -6693,7 +6861,7 @@ int btf_struct_access(struct bpf_verifier_log *log, for (i = 0; i < rec->cnt; i++) { struct btf_field *field = &rec->fields[i]; u32 offset = field->offset; - if (off < offset + btf_field_type_size(field->type) && offset < off + size) { + if (off < offset + field->size && offset < off + size) { bpf_log(log, "direct access to %s is disallowed\n", btf_field_type_name(field->type)); @@ -7370,8 +7538,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); } -static void btf_seq_show(struct btf_show *show, const char *fmt, - va_list args) +__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt, + va_list args) { seq_vprintf((struct seq_file *)show->target, fmt, args); } @@ -7404,8 +7572,8 @@ struct btf_show_snprintf { int len; /* length we would have written */ }; -static void btf_snprintf_show(struct btf_show *show, const char *fmt, - va_list args) +__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt, + va_list args) { struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; int len; @@ -7669,7 +7837,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, err = -ENOMEM; goto out; } - btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size, + mod->btf_base_data, mod->btf_base_data_size); if (IS_ERR(btf)) { kfree(btf_mod); if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { @@ -7993,7 +8162,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; - u32 set_cnt; + u32 set_cnt, i; int ret; if (hook >= BTF_KFUNC_HOOK_MAX) { @@ -8039,21 +8208,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, goto end; } - /* We don't need to allocate, concatenate, and sort module sets, because - * only one is allowed per hook. Hence, we can directly assign the - * pointer and return. - */ - if (!vmlinux_set) { - tab->sets[hook] = add_set; - goto do_add_filter; - } - /* In case of vmlinux sets, there may be more than one set being * registered per hook. To create a unified set, we allocate a new set * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary * searching the set using btf_id_set8_contains function work. + * + * For module sets, we need to allocate as we may need to relocate + * BTF ids. */ set_cnt = set ? set->cnt : 0; @@ -8083,11 +8246,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Concatenate the two sets */ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); + /* Now that the set is copied, update with relocated BTF ids */ + for (i = set->cnt; i < set->cnt + add_set->cnt; i++) + set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id); + set->cnt += add_set->cnt; sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); -do_add_filter: if (add_filter) { hook_filter = &tab->hook_filters[hook]; hook_filter->filters[hook_filter->nr_filters++] = kset->filter; @@ -8207,7 +8373,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, return PTR_ERR(btf); for (i = 0; i < kset->set->cnt; i++) { - ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id, + ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id), kset->set->pairs[i].flags); if (ret) goto err_out; @@ -8271,7 +8437,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc u32 nr_args, i; for (i = 0; i < cnt; i++) { - dtor_btf_id = dtors[i].kfunc_btf_id; + dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id); dtor_func = btf_type_by_id(btf, dtor_btf_id); if (!dtor_func || !btf_type_is_func(dtor_func)) @@ -8306,7 +8472,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c { struct btf_id_dtor_kfunc_tab *tab; struct btf *btf; - u32 tab_cnt; + u32 tab_cnt, i; int ret; btf = btf_get_module_btf(owner); @@ -8357,6 +8523,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c btf->dtor_kfunc_tab = tab; memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + + /* remap BTF ids based on BTF relocation (if any) */ + for (i = tab_cnt; i < tab_cnt + add_cnt; i++) { + tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id); + tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id); + } + tab->cnt += add_cnt; sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a6c3faa6e4a..7ee62e38faf0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -736,11 +736,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; - char *ret = NULL; + int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); @@ -748,9 +748,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strscpy(sym, ksym->name, KSYM_NAME_LEN); + ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); - ret = sym; if (size) *size = symbol_end - symbol_start; if (off) @@ -1174,8 +1173,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, } /* Copy JITed text from rw_header to its final location, the ro_header. */ -int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, - struct bpf_binary_header *ro_header, +int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; @@ -2743,8 +2741,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -void __bpf_free_used_btfs(struct bpf_prog_aux *aux, - struct btf_mod_pair *used_btfs, u32 len) +void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; @@ -2761,7 +2758,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { - __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a8e34416e960..fbdf5a1aabfe 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map { struct bpf_cpu_map_entry __rcu **cpu_map; }; -static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); - static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; @@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, int xdp_n, struct xdp_cpumap_stats *stats, struct list_head *list) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int nframes; if (!rcpu->prog) return xdp_n; rcu_read_lock_bh(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); @@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (unlikely(!list_empty(list))) cpu_map_bpf_prog_run_skb(rcpu, list, stats); + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; @@ -706,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -723,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) */ bq->q[bq->count++] = xdpf; - if (!bq->flush_node.prev) + if (!bq->flush_node.prev) { + struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list(); + list_add(&bq->flush_node, flush_list); + } } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -756,9 +759,8 @@ trace: return ret; } -void __cpu_map_flush(void) +void __cpu_map_flush(struct list_head *flush_list) { - struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -768,24 +770,3 @@ void __cpu_map_flush(void) wake_up_process(bq->obj->kthread); } } - -#ifdef CONFIG_DEBUG_NET -bool cpu_map_check_flush(void) -{ - if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) - return false; - __cpu_map_flush(); - return true; -} -#endif - -static int __init cpu_map_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); - return 0; -} - -subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 2bee4af91e38..94854cd9c4cc 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -275,7 +275,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, if (__bpf_dynptr_is_rdonly(dst)) return -EINVAL; - siv_len = __bpf_dynptr_size(siv); + siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); if (!src_len || !dst_len) @@ -303,36 +303,44 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, /** * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. - * @ctx: The crypto context being used. The ctx must be a trusted pointer. - * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. - * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. - * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL. * * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. */ __bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, - const struct bpf_dynptr_kern *src, - const struct bpf_dynptr_kern *dst, - const struct bpf_dynptr_kern *siv) + const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, + const struct bpf_dynptr *siv__nullable) { - return bpf_crypto_crypt(ctx, src, dst, siv, true); + const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src; + const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst; + const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable; + + return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true); } /** * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. - * @ctx: The crypto context being used. The ctx must be a trusted pointer. - * @src: bpf_dynptr to the plain data. Must be a trusted pointer. - * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. - * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL. * * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. */ __bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, - const struct bpf_dynptr_kern *src, - const struct bpf_dynptr_kern *dst, - const struct bpf_dynptr_kern *siv) + const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, + const struct bpf_dynptr *siv__nullable) { - return bpf_crypto_crypt(ctx, src, dst, siv, false); + const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src; + const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst; + const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable; + + return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false); } __bpf_kfunc_end_defs(); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4e2cdbb5629f..9e0e3b0a18e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -83,7 +83,6 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } -static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) +static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; @@ -121,23 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + /* Hash table size must be power of 2; roundup_pow_of_two() + * can overflow into UB on 32-bit arches + */ + if (attr->max_entries > 1UL << 31) + return -EINVAL; + } + + return 0; +} + +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) +{ /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - - bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - /* hash table size must be power of 2; roundup_pow_of_two() can - * overflow into UB on 32-bit arches, so check that first - */ - if (dtab->map.max_entries > 1UL << 31) - return -EINVAL; - + /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) @@ -196,7 +200,14 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); + /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() + * during NAPI callback and cleared after the XDP redirect. There is no + * explicit RCU read section which protects bpf_redirect_info->map but + * local_bh_disable() also marks the beginning an RCU section. This + * makes the complete softirq callback RCU protected. Thus after + * following synchronize_rcu() there no bpf_redirect_info->map == map + * assignment. + */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -406,9 +417,8 @@ out: * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ -void __dev_flush(void) +void __dev_flush(struct list_head *flush_list) { - struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -419,16 +429,6 @@ void __dev_flush(void) } } -#ifdef CONFIG_DEBUG_NET -bool dev_check_flush(void) -{ - if (list_empty(this_cpu_ptr(&dev_flush_list))) - return false; - __dev_flush(); - return true; -} -#endif - /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. @@ -453,7 +453,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -467,6 +466,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, * are only ever modified together. */ if (!bq->dev_rx) { + struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); + bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); @@ -760,9 +761,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { - if (!dst) - continue; - if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; @@ -1043,6 +1041,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map) BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, @@ -1057,6 +1056,7 @@ const struct bpf_map_ops dev_map_ops = { const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, @@ -1156,15 +1156,11 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { - int cpu; - /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a69a9a36c0f..b5f0adae8293 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1084,7 +1084,10 @@ struct bpf_async_cb { struct bpf_prog *prog; void __rcu *callback_fn; void *value; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct delete_work; + }; u64 flags; }; @@ -1107,6 +1110,7 @@ struct bpf_async_cb { struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; + atomic_t cancelling; }; struct bpf_work { @@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work) kfree_rcu(w, cb.rcu); } +static void bpf_timer_delete_work(struct work_struct *work) +{ + struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call + * kfree_rcu(t) right after for both preallocated and non-preallocated + * maps. The async->cb = NULL was already done and no code path can see + * address 't' anymore. Timer if armed for existing bpf_hrtimer before + * bpf_timer_cancel_and_free will have been cancelled. + */ + hrtimer_cancel(&t->timer); + kfree_rcu(t, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { @@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; + atomic_set(&t->cancelling, 0); + INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; @@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async) BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { - struct bpf_hrtimer *t; + struct bpf_hrtimer *t, *cur_t; + bool inc = false; int ret = 0; if (in_nmi()) @@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) ret = -EINVAL; goto out; } - if (this_cpu_read(hrtimer_running) == t) { + + cur_t = this_cpu_read(hrtimer_running); + if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock - * since it waits for callback_fn to finish + * since it waits for callback_fn to finish. + */ + ret = -EDEADLK; + goto out; + } + + /* Only account in-flight cancellations when invoked from a timer + * callback, since we want to avoid waiting only if other _callbacks_ + * are waiting on us, to avoid introducing lockups. Non-callback paths + * are ok, since nobody would synchronously wait for their completion. + */ + if (!cur_t) + goto drop; + atomic_inc(&t->cancelling); + /* Need full barrier after relaxed atomic_inc */ + smp_mb__after_atomic(); + inc = true; + if (atomic_read(&cur_t->cancelling)) { + /* We're cancelling timer t, while some other timer callback is + * attempting to cancel us. In such a case, it might be possible + * that timer t belongs to the other callback, or some other + * callback waiting upon it (creating transitive dependencies + * upon us), and we will enter a deadlock if we continue + * cancelling and waiting for it synchronously, since it might + * do the same. Bail! */ ret = -EDEADLK; goto out; } +drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1467,6 +1516,8 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + if (inc) + atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } @@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val) if (!t) return; - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call kfree(t) - * right after for both preallocated and non-preallocated maps. - * The async->cb = NULL was already done and no code path can - * see address 't' anymore. - * - * Check that bpf_map_delete/update_elem() wasn't called from timer - * callback_fn. In such case don't call hrtimer_cancel() (since it will - * deadlock) and don't call hrtimer_try_to_cancel() (since it will just - * return -1). Though callback_fn is still running on this cpu it's + /* We check that bpf_map_delete/update_elem() was called from timer + * callback_fn. In such case we don't call hrtimer_cancel() (since it + * will deadlock) and don't call hrtimer_try_to_cancel() (since it will + * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. + * + * However, it is possible the timer callback_fn calling us armed the + * timer _before_ calling us, such that failing to cancel it here will + * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. + * Therefore, we _need_ to cancel any outstanding timers before we do + * kfree_rcu, even though no more timers can be armed. + * + * Moreover, we need to schedule work even if timer does not belong to + * the calling callback_fn, as on two different CPUs, we can end up in a + * situation where both sides run in parallel, try to cancel one + * another, and we end up waiting on both sides in hrtimer_cancel + * without making forward progress, since timer1 depends on time2 + * callback to finish, and vice versa. + * + * CPU 1 (timer1_cb) CPU 2 (timer2_cb) + * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) + * + * To avoid these issues, punt to workqueue context when we are in a + * timer callback. */ - if (this_cpu_read(hrtimer_running) != t) - hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + if (this_cpu_read(hrtimer_running)) + queue_work(system_unbound_wq, &t->cb.delete_work); + else + bpf_timer_delete_work(&t->cb.delete_work); } /* This function is called by map_delete/update_elem for individual element and @@ -2433,7 +2498,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. - * @ptr: The dynptr whose data slice to retrieve + * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the @@ -2459,9 +2524,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) { + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; u32 len = buffer__szk; int err; @@ -2503,7 +2569,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. - * @ptr: The dynptr whose data slice to retrieve + * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the @@ -2543,9 +2609,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) { + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; @@ -2571,11 +2639,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ - return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk); + return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u32 size; if (!ptr->data || start > end) @@ -2592,36 +2661,45 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en return 0; } -__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr) +__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + return !ptr->data; } -__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } -__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr, - struct bpf_dynptr_kern *clone__uninit) +__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, + struct bpf_dynptr *clone__uninit) { + struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + if (!ptr->data) { - bpf_dynptr_set_null(clone__uninit); + bpf_dynptr_set_null(clone); return -EINVAL; } - *clone__uninit = *ptr; + *clone = *ptr; return 0; } @@ -2721,7 +2799,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) } __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + int (callback_fn)(void *map, int *key, void *value), unsigned int flags, void *aux__ign) { @@ -2744,6 +2822,122 @@ __bpf_kfunc void bpf_preempt_enable(void) preempt_enable(); } +struct bpf_iter_bits { + __u64 __opaque[2]; +} __aligned(8); + +struct bpf_iter_bits_kern { + union { + unsigned long *bits; + unsigned long bits_copy; + }; + u32 nr_bits; + int bit; +} __aligned(8); + +/** + * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area + * @it: The new bpf_iter_bits to be created + * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over + * @nr_words: The size of the specified memory area, measured in 8-byte units. + * Due to the limitation of memalloc, it can't be greater than 512. + * + * This function initializes a new bpf_iter_bits structure for iterating over + * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It + * copies the data of the memory area to the newly created bpf_iter_bits @it for + * subsequent iteration operations. + * + * On success, 0 is returned. On failure, ERR is returned. + */ +__bpf_kfunc int +bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) +{ + struct bpf_iter_bits_kern *kit = (void *)it; + u32 nr_bytes = nr_words * sizeof(u64); + u32 nr_bits = BYTES_TO_BITS(nr_bytes); + int err; + + BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != + __alignof__(struct bpf_iter_bits)); + + kit->nr_bits = 0; + kit->bits_copy = 0; + kit->bit = -1; + + if (!unsafe_ptr__ign || !nr_words) + return -EINVAL; + + /* Optimization for u64 mask */ + if (nr_bits == 64) { + err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); + if (err) + return -EFAULT; + + kit->nr_bits = nr_bits; + return 0; + } + + /* Fallback to memalloc */ + kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); + if (!kit->bits) + return -ENOMEM; + + err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); + if (err) { + bpf_mem_free(&bpf_global_ma, kit->bits); + return err; + } + + kit->nr_bits = nr_bits; + return 0; +} + +/** + * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits + * @it: The bpf_iter_bits to be checked + * + * This function returns a pointer to a number representing the value of the + * next bit in the bits. + * + * If there are no further bits available, it returns NULL. + */ +__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) +{ + struct bpf_iter_bits_kern *kit = (void *)it; + u32 nr_bits = kit->nr_bits; + const unsigned long *bits; + int bit; + + if (nr_bits == 0) + return NULL; + + bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; + bit = find_next_bit(bits, nr_bits, kit->bit + 1); + if (bit >= nr_bits) { + kit->nr_bits = 0; + return NULL; + } + + kit->bit = bit; + return &kit->bit; +} + +/** + * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits + * @it: The bpf_iter_bits to be destroyed + * + * Destroy the resource associated with the bpf_iter_bits. + */ +__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) +{ + struct bpf_iter_bits_kern *kit = (void *)it; + + if (kit->nr_bits <= 64) + return; + bpf_mem_free(&bpf_global_ma, kit->bits); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2826,6 +3020,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) +BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -2867,7 +3064,9 @@ late_initcall(kfunc_init); */ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) { - return bpf_dynptr_slice(ptr, 0, NULL, len); + const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; + + return bpf_dynptr_slice(p, 0, NULL, len); } /* Get a pointer to dynptr data up to len bytes for read write access. If diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 4bd8f17a9f24..5aebfc3051e3 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -91,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, goto fail; } else { u64 new_end, new_start; - u32 buf_start, buf_end, new_n; + u32 buf_start, buf_end; new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) @@ -708,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); if (reg->id) - verbose_a("id=%d", reg->id); + verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); + if (reg->id & BPF_ADD_CONST) + verbose(env, "%+d", reg->off); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0ee653a936ea..e20b90c36131 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -51,7 +51,8 @@ struct bpf_ringbuf { * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the - * kernel. + * kernel. Note that the pending counter is placed in the same + * page as the producer, so that it shares the same cache line. * * Kernel-producer * --------------- @@ -70,6 +71,7 @@ struct bpf_ringbuf { */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); + unsigned long pending_pos; char data[] __aligned(PAGE_SIZE); }; @@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; + rb->pending_pos = 0; return rb; } @@ -404,9 +407,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, flags; - u32 len, pg_off; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; struct bpf_ringbuf_hdr *hdr; + u32 len, pg_off, tmp_size, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -424,13 +427,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) spin_lock_irqsave(&rb->spinlock, flags); } + pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; - /* check for out of ringbuf space by ensuring producer position - * doesn't advance more than (ringbuf_size - 1) ahead + while (pend_pos < prod_pos) { + hdr = (void *)rb->data + (pend_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + if (hdr_len & BPF_RINGBUF_BUSY_BIT) + break; + tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; + tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); + pend_pos += tmp_size; + } + rb->pending_pos = pend_pos; + + /* check for out of ringbuf space: + * - by ensuring producer position doesn't advance more than + * (ringbuf_size - 1) ahead + * - by ensuring oldest not yet committed record until newest + * record does not span more than (ringbuf_size - 1) */ - if (new_prod_pos - cons_pos > rb->mask) { + if (new_prod_pos - cons_pos > rb->mask || + new_prod_pos - pend_pos > rb->mask) { spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2222c3ff88e7..869265852d51 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2998,6 +2998,7 @@ static int bpf_obj_get(const union bpf_attr *attr) void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { + WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; link->id = 0; @@ -3056,16 +3057,17 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + const struct bpf_link_ops *ops = link->ops; bool sleepable = false; bpf_link_free_id(link->id); if (link->prog) { sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ - link->ops->release(link); + ops->release(link); bpf_prog_put(link->prog); } - if (link->ops->dealloc_deferred) { + if (ops->dealloc_deferred) { /* schedule BPF link deallocation; if underlying BPF program * is sleepable, we need to first wait for RCU tasks trace * sync, then go through "classic" RCU grace period @@ -3074,9 +3076,8 @@ static void bpf_link_free(struct bpf_link *link) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } - if (link->ops->dealloc) - link->ops->dealloc(link); + } else if (ops->dealloc) + ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3150,6 +3151,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) +{ + struct bpf_link *link = file->private_data; + + return link->ops->poll(file, pts); +} + static const struct file_operations bpf_link_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, @@ -3159,6 +3167,16 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; +static const struct file_operations bpf_link_fops_poll = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, + .poll = bpf_link_poll, +}; + static int bpf_link_alloc_id(struct bpf_link *link) { int id; @@ -3201,7 +3219,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) return id; } - file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + file = anon_inode_getfile("bpf_link", + link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, + link, O_CLOEXEC); if (IS_ERR(file)) { bpf_link_free_id(id); put_unused_fd(fd); @@ -3229,7 +3249,9 @@ int bpf_link_settle(struct bpf_link_primer *primer) int bpf_link_new_fd(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); + return anon_inode_getfd("bpf-link", + link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, + link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -3239,7 +3261,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_link_fops) { + if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) { fdput(f); return ERR_PTR(-EINVAL); } @@ -4971,7 +4993,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); - else if (f.file->f_op == &bpf_link_fops) + else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll) err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else @@ -5106,7 +5128,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (!file) return -EBADF; - if (file->f_op == &bpf_link_fops) { + if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { struct bpf_link *link = file->private_data; if (link->ops == &bpf_raw_tp_link_lops) { @@ -5416,10 +5438,11 @@ static int link_detach(union bpf_attr *attr) return ret; } -static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) +struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); } +EXPORT_SYMBOL(bpf_link_inc_not_zero); struct bpf_link *bpf_link_by_id(u32 id) { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index ec4e97c61eef..02aa9db8d796 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -261,6 +261,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; + struct file *f; /* If this function returns a non-NULL file object, * it held a reference to the task/file. @@ -286,12 +287,8 @@ again: } rcu_read_lock(); - for (;; curr_fd++) { - struct file *f; - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); - if (!f) - break; - + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + if (f) { /* set info->fd */ info->fd = curr_fd; rcu_read_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48f3a9acdef3..8da132a1ef28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2982,8 +2982,10 @@ static int check_subprogs(struct bpf_verifier_env *env) if (code == (BPF_JMP | BPF_CALL) && insn[i].src_reg == 0 && - insn[i].imm == BPF_FUNC_tail_call) + insn[i].imm == BPF_FUNC_tail_call) { subprog[cur_subprog].has_tail_call = true; + subprog[cur_subprog].tail_call_reachable = true; + } if (BPF_CLASS(code) == BPF_LD && (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) subprog[cur_subprog].has_ld_abs = true; @@ -3215,7 +3217,8 @@ static int insn_def_regno(const struct bpf_insn *insn) case BPF_ST: return -1; case BPF_STX: - if (BPF_MODE(insn->code) == BPF_ATOMIC && + if ((BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && (insn->imm & BPF_FETCH)) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; @@ -3991,7 +3994,7 @@ static bool idset_contains(struct bpf_idset *s, u32 id) u32 i; for (i = 0; i < s->count; ++i) - if (s->ids[i] == id) + if (s->ids[i] == (id & ~BPF_ADD_CONST)) return true; return false; @@ -4001,7 +4004,7 @@ static int idset_push(struct bpf_idset *s, u32 id) { if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) return -EFAULT; - s->ids[s->count++] = id; + s->ids[s->count++] = id & ~BPF_ADD_CONST; return 0; } @@ -4438,8 +4441,20 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg) { - if (src_reg->type == SCALAR_VALUE && !src_reg->id && - !tnum_is_const(src_reg->var_off)) + if (src_reg->type != SCALAR_VALUE) + return; + + if (src_reg->id & BPF_ADD_CONST) { + /* + * The verifier is processing rX = rY insn and + * rY->id has special linked register already. + * Cleared it, since multiple rX += const are not supported. + */ + src_reg->id = 0; + src_reg->off = 0; + } + + if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to * dst_reg and then will be used by find_equal_scalars() to * propagate min/max range. @@ -4549,11 +4564,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && env->bpf_capable) { - struct bpf_reg_state fake_reg = {}; + struct bpf_reg_state *tmp_reg = &env->fake_reg[0]; - __mark_reg_known(&fake_reg, insn->imm); - fake_reg.type = SCALAR_VALUE; - save_register_state(env, state, spi, &fake_reg, size); + memset(tmp_reg, 0, sizeof(*tmp_reg)); + __mark_reg_known(tmp_reg, insn->imm); + tmp_reg->type = SCALAR_VALUE; + save_register_state(env, state, spi, tmp_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -5448,7 +5464,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < p + btf_field_type_size(field->type) && + if (reg->smin_value + off < p + field->size && p < reg->umax_value + off + size) { switch (field->type) { case BPF_KPTR_UNREF: @@ -6235,6 +6251,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size) } reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; + reg->var_off = tnum_subreg(tnum_unknown); } static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) @@ -6279,6 +6296,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->s32_max_value = s32_max; reg->u32_min_value = (u32)s32_min; reg->u32_max_value = (u32)s32_max; + reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -7712,6 +7730,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; int err; + if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { + verbose(env, + "arg#%d expected pointer to stack or const struct bpf_dynptr\n", + regno); + return -EINVAL; + } + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ @@ -9461,6 +9486,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); + if (ret) + return ret; + ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) return ret; @@ -10914,7 +10943,7 @@ enum { }; BTF_ID_LIST(kf_arg_btf_ids) -BTF_ID(struct, bpf_dynptr_kern) +BTF_ID(struct, bpf_dynptr) BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) @@ -11128,7 +11157,11 @@ BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +#ifdef CONFIG_BPF_EVENTS BTF_ID(func, bpf_session_cookie) +#else +BTF_ID_UNUSED +#endif static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11183,6 +11216,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + return KF_ARG_PTR_TO_NULL; + if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; @@ -11228,9 +11264,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) - return KF_ARG_PTR_TO_NULL; - if (argno + 1 < nargs && (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) @@ -11261,6 +11294,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, bool strict_type_match = false; const struct btf *reg_btf; const char *reg_ref_tname; + bool taking_projection; + bool struct_same; u32 reg_ref_id; if (base_type(reg->type) == PTR_TO_BTF_ID) { @@ -11300,11 +11335,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; - WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off); + WARN_ON_ONCE(is_kfunc_release(meta) && + (reg->off || !tnum_is_const(reg->var_off) || + reg->var_off.value)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); - if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { + struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); + /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot + * actually use it -- it must cast to the underlying type. So we allow + * caller to pass in the underlying type. + */ + taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); + if (!taking_projection && !struct_same) { verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, btf_type_str(reg_ref_t), reg_ref_tname); @@ -11644,7 +11687,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_off = reg->off + reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); - if (!field || field->offset != node_off) { + if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); return -EINVAL; } @@ -11876,12 +11919,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - fallthrough; case KF_ARG_PTR_TO_CTX: - /* Trusted arguments have the same offset checks as release arguments */ - arg_type |= OBJ_RELEASE; - break; case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: @@ -11894,7 +11933,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: - /* Trusted by default */ break; default: WARN_ON_ONCE(1); @@ -11950,12 +11988,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; int clone_ref_obj_id = 0; - if (reg->type != PTR_TO_STACK && - reg->type != CONST_PTR_TO_DYNPTR) { - verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); - return -EINVAL; - } - if (reg->type == CONST_PTR_TO_DYNPTR) dynptr_arg_type |= MEM_RDONLY; @@ -12694,46 +12726,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } -static bool signed_add_overflows(s64 a, s64 b) -{ - /* Do the add in u64, where overflow is well-defined */ - s64 res = (s64)((u64)a + (u64)b); - - if (b < 0) - return res > a; - return res < a; -} - -static bool signed_add32_overflows(s32 a, s32 b) -{ - /* Do the add in u32, where overflow is well-defined */ - s32 res = (s32)((u32)a + (u32)b); - - if (b < 0) - return res > a; - return res < a; -} - -static bool signed_sub_overflows(s64 a, s64 b) -{ - /* Do the sub in u64, where overflow is well-defined */ - s64 res = (s64)((u64)a - (u64)b); - - if (b < 0) - return res < a; - return res > a; -} - -static bool signed_sub32_overflows(s32 a, s32 b) -{ - /* Do the sub in u32, where overflow is well-defined */ - s32 res = (s32)((u32)a - (u32)b); - - if (b < 0) - return res < a; - return res > a; -} - static bool check_reg_sane_offset(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) @@ -13215,21 +13207,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - if (signed_add_overflows(smin_ptr, smin_val) || - signed_add_overflows(smax_ptr, smax_val)) { + if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || + check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value = smin_ptr + smin_val; - dst_reg->smax_value = smax_ptr + smax_val; } - if (umin_ptr + umin_val < umin_ptr || - umax_ptr + umax_val < umax_ptr) { + if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || + check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value = umin_ptr + umin_val; - dst_reg->umax_value = umax_ptr + umax_val; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; @@ -13272,14 +13258,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ - if (signed_sub_overflows(smin_ptr, smax_val) || - signed_sub_overflows(smax_ptr, smin_val)) { + if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || + check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { /* Overflow possible, we know nothing */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value = smin_ptr - smax_val; - dst_reg->smax_value = smax_ptr - smin_val; } if (umin_ptr < umax_val) { /* Overflow possible, we know nothing */ @@ -13332,71 +13315,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin_val = src_reg->s32_min_value; - s32 smax_val = src_reg->s32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; - if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) || - signed_add32_overflows(dst_reg->s32_max_value, smax_val)) { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - dst_reg->s32_min_value += smin_val; - dst_reg->s32_max_value += smax_val; + if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || + check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; } - if (dst_reg->u32_min_value + umin_val < umin_val || - dst_reg->u32_max_value + umax_val < umax_val) { - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - dst_reg->u32_min_value += umin_val; - dst_reg->u32_max_value += umax_val; + if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || + check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { + *dst_umin = 0; + *dst_umax = U32_MAX; } } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin_val = src_reg->smin_value; - s64 smax_val = src_reg->smax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; - if (signed_add_overflows(dst_reg->smin_value, smin_val) || - signed_add_overflows(dst_reg->smax_value, smax_val)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value += smin_val; - dst_reg->smax_value += smax_val; + if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || + check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; } - if (dst_reg->umin_value + umin_val < umin_val || - dst_reg->umax_value + umax_val < umax_val) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value += umin_val; - dst_reg->umax_value += umax_val; + if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || + check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { + *dst_umin = 0; + *dst_umax = U64_MAX; } } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin_val = src_reg->s32_min_value; - s32 smax_val = src_reg->s32_max_value; + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; - if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) || - signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) { + if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || + check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { /* Overflow possible, we know nothing */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - dst_reg->s32_min_value -= smax_val; - dst_reg->s32_max_value -= smin_val; + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; } if (dst_reg->u32_min_value < umax_val) { /* Overflow possible, we know nothing */ @@ -13412,19 +13380,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin_val = src_reg->smin_value; - s64 smax_val = src_reg->smax_value; + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; - if (signed_sub_overflows(dst_reg->smin_value, smax_val) || - signed_sub_overflows(dst_reg->smax_value, smin_val)) { + if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || + check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value -= smax_val; - dst_reg->smax_value -= smin_val; + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; } if (dst_reg->umin_value < umax_val) { /* Overflow possible, we know nothing */ @@ -14030,6 +13995,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; + bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); u8 opcode = BPF_OP(insn->code); int err; @@ -14052,11 +14018,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; - else - /* Make sure ID is cleared otherwise dst_reg min/max could be - * incorrectly propagated into other registers by find_equal_scalars() - */ - dst_reg->id = 0; + if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -14120,7 +14082,43 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } - return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); + if (err) + return err; + /* + * Compilers can generate the code + * r1 = r2 + * r1 += 0x1 + * if r2 < 1000 goto ... + * use r1 in memory access + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. + */ + if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && + dst_reg->id && is_reg_const(src_reg, alu32)) { + u64 val = reg_const_value(src_reg, alu32); + + if ((dst_reg->id & BPF_ADD_CONST) || + /* prevent overflow in find_equal_scalars() later */ + val > (u32)S32_MAX) { + /* + * If the register already went through rX += val + * we cannot accumulate another val into rx->off. + */ + dst_reg->off = 0; + dst_reg->id = 0; + } else { + dst_reg->id |= BPF_ADD_CONST; + dst_reg->off = val; + } + } else { + /* + * Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; + } + return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ @@ -15092,12 +15090,36 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static void find_equal_scalars(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg) { + struct bpf_reg_state fake_reg; struct bpf_func_state *state; struct bpf_reg_state *reg; bpf_for_each_reg_in_vstate(vstate, state, reg, ({ - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + if (reg->type != SCALAR_VALUE || reg == known_reg) + continue; + if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) + continue; + if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || + reg->off == known_reg->off) { copy_register_state(reg, known_reg); + } else { + s32 saved_off = reg->off; + + fake_reg.type = SCALAR_VALUE; + __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + + /* reg = known_reg; reg += delta */ + copy_register_state(reg, known_reg); + /* + * Must preserve off, id and add_const flag, + * otherwise another find_equal_scalars() will be incorrect. + */ + reg->off = saved_off; + + scalar32_min_max_add(reg, &fake_reg); + scalar_min_max_add(reg, &fake_reg); + reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + } })); } @@ -15109,7 +15131,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; - struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15175,7 +15196,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } - src_reg = &fake_reg; + src_reg = &env->fake_reg[0]; + memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); } @@ -15235,10 +15257,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, &other_branch_regs[insn->src_reg], dst_reg, src_reg, opcode, is_jmp32); } else /* BPF_SRC(insn->code) == BPF_K */ { + /* reg_set_min_max() can mangle the fake_reg. Make a copy + * so that these are two different memory locations. The + * src_reg is not used beyond here in context of K. + */ + memcpy(&env->fake_reg[1], &env->fake_reg[0], + sizeof(env->fake_reg[0])); err = reg_set_min_max(env, &other_branch_regs[insn->dst_reg], - src_reg /* fake one */, - dst_reg, src_reg /* same fake one */, + &env->fake_reg[0], + dst_reg, &env->fake_reg[1], opcode, is_jmp32); } if (err) @@ -16726,6 +16754,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } if (!rold->precise && exact == NOT_EXACT) return true; + if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) + return false; + if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) + return false; /* Why check_ids() for scalar registers? * * Consider the following BPF code: @@ -17437,11 +17469,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto skip_inf_loop_check; } if (is_may_goto_insn_at(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + if (sl->state.may_goto_depth != cur->may_goto_depth && + states_equal(env, &sl->state, cur, RANGE_WITHIN)) { update_loop_entry(cur, &sl->state); goto hit; } - goto skip_inf_loop_check; } if (calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) @@ -18607,8 +18639,7 @@ static void release_maps(struct bpf_verifier_env *env) /* drop refcnt of maps used by the rejected program */ static void release_btfs(struct bpf_verifier_env *env) { - __bpf_free_used_btfs(env->prog->aux, env->used_btfs, - env->used_btf_cnt); + __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ @@ -18719,6 +18750,41 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* + * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the + * jump offset by 'delta'. + */ +static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 insn_cnt = prog->len, i; + s32 imm; + s16 off; + + for (i = 0; i < insn_cnt; i++, insn++) { + u8 code = insn->code; + + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || + BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) + continue; + + if (insn->code == (BPF_JMP32 | BPF_JA)) { + if (i + 1 + insn->imm != tgt_idx) + continue; + if (check_add_overflow(insn->imm, delta, &imm)) + return -ERANGE; + insn->imm = imm; + } else { + if (i + 1 + insn->off != tgt_idx) + continue; + if (check_add_overflow(insn->off, delta, &off)) + return -ERANGE; + insn->off = off; + } + } + return 0; +} + static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, u32 off, u32 cnt) { @@ -19993,7 +20059,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) stack_depth_extra = 8; insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); cnt = 4; @@ -20309,7 +20378,7 @@ patch_map_ops_generic: goto next_insn; } -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) /* Implement bpf_get_smp_processor_id() inline. */ if (insn->imm == BPF_FUNC_get_smp_processor_id && prog->jit_requested && bpf_jit_supports_percpu_insn()) { @@ -20535,6 +20604,13 @@ next_insn: if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; + /* + * If may_goto is a first insn of a prog there could be a jmp + * insn that points to it, hence adjust all such jmps to point + * to insn after BPF_ST that inits may_goto count. + * Adjustment will succeed because bpf_patch_insn_data() didn't fail. + */ + WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); } /* Since poke tab is now finalized, publish aux to tracker. */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e32b6972c478..c8e4b62b436a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); - if (ret < 0) + if (ret < 0) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); return ret; + } } } else { ret = cgroup_addrm_files(css, cgrp, @@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); + css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { @@ -1922,6 +1925,7 @@ enum cgroup2_param { Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, + Opt_pids_localevents, nr__cgroup2_params }; @@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), + fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; @@ -1960,6 +1965,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; + case Opt_pids_localevents: + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1989,6 +1997,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } @@ -2004,6 +2017,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + seq_puts(seq, ",pids_localevents"); return 0; } @@ -6686,8 +6701,10 @@ void cgroup_exit(struct task_struct *tsk) WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + /* matches the signal->live check in css_task_iter_advance() */ + if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) + list_add_tail(&tsk->cg_list, &cset->dying_tasks); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); @@ -6714,10 +6731,12 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); + if (!list_empty(&task->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) @@ -7062,7 +7081,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" - "memory_hugetlb_accounting\n"); + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c12b9fdb22a4..40ec4abaf440 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,6 +21,7 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cgroup-internal.h" #include <linux/cpu.h> #include <linux/cpumask.h> @@ -87,7 +88,7 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", - [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; @@ -127,19 +128,28 @@ struct cpuset { /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * This exclusive CPUs must be a subset of cpus_allowed. A parent - * cgroup can only grant exclusive CPUs to one of its children. + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. * - * When the cgroup becomes a valid partition root, effective_xcpus - * defaults to cpus_allowed if not set. The effective_cpus of a valid - * partition root comes solely from its effective_xcpus and some of the - * effective_xcpus may be distributed to sub-partitions below & hence - * excluded from its effective_cpus. + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. */ cpumask_var_t effective_xcpus; /* * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. */ cpumask_var_t exclusive_cpus; @@ -169,7 +179,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of valid sub-partitions */ + /* number of valid local child partitions */ int nr_subparts; /* partition root state */ @@ -230,6 +240,17 @@ static struct list_head remote_children; * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -434,7 +455,7 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); +} + static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : @@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * If either I or some sibling (!= me) is exclusive, we can't - * overlap + * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur) { + bool txset, cxset; /* Are exclusive_cpus set? */ + + if (c == cur) + continue; + + txset = !cpumask_empty(trial->exclusive_cpus); + cxset = !cpumask_empty(c->exclusive_cpus); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || + (txset && cxset)) { if (!cpusets_are_exclusive(trial, c)) goto out; + } else if (txset || cxset) { + struct cpumask *xcpus, *acpus; + + /* + * When just one of the exclusive_cpus's is set, + * cpus_allowed of the other cpuset, if set, cannot be + * a subset of it or none of those CPUs will be + * available if these exclusive CPUs are activated. + */ + if (txset) { + xcpus = trial->exclusive_cpus; + acpus = c->cpus_allowed; + } else { + xcpus = c->exclusive_cpus; + acpus = trial->cpus_allowed; + } + if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) + goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } @@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains, int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; - if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_valid(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. + */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) csa[i]->pn = i; ndoms = csn; @@ -1064,6 +1145,20 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; + } + goto done; + } + for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; struct cpumask *dp; @@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, */ static int update_partition_exclusive(struct cpuset *cs, int new_prs) { - bool exclusive = (new_prs > 0); + bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) @@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * Return: true if xcpus is not empty, false otherwise. * * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of cpus_allowed and parent's effective_xcpus. + * it must be a subset of parent's effective_xcpus. */ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) @@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus; - if (!cpumask_empty(cs->exclusive_cpus)) - cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); - else - cpumask_copy(xcpus, cs->cpus_allowed); - - return cpumask_and(xcpus, xcpus, parent->effective_xcpus); + return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); } static inline bool is_remote_partition(struct cpuset *cs) @@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->exclusive_cpus) - ? cs->effective_xcpus : cs->cpus_allowed; + xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if (!newmask && cpumask_empty(cs->cpus_allowed)) + if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); @@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; - if (!is_cpu_exclusive(cs)) - set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); } /* Nothing to do if the CPUs didn't change */ @@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) ? partcmd_enable : partcmd_enablei; /* - * cpus_allowed cannot be empty. + * cpus_allowed and exclusive_cpus cannot be both empty. */ - if (cpumask_empty(cs->cpus_allowed)) { + if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } @@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) } __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); @@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->use_parent_ecpus = true; parent->child_ecpus_count++; } - - /* - * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void) * In the rare case that hotplug removes all the cpus in * subpartitions_cpus, we assumed that cpus are updated. */ - if (!cpus_updated && top_cpuset.nr_subparts) + if (!cpus_updated && !cpumask_empty(subpartitions_cpus)) cpus_updated = true; /* For v1, synchronize cpus_allowed to cpu_active_mask */ @@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 79a3717a5803..0e26068995a6 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, misc_res_name[type]); } +static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage) +{ + u64 old; + + while (true) { + old = atomic64_read(&res->watermark); + if (new_usage <= old) + break; + if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old) + break; + } +} + +static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg) +{ + atomic64_inc(&cg->res[type].events_local); + cgroup_file_notify(&cg->events_local_file); + + for (; parent_misc(cg); cg = parent_misc(cg)) { + atomic64_inc(&cg->res[type].events); + cgroup_file_notify(&cg->events_file); + } +} + /** * misc_cg_try_charge() - Try charging the misc cgroup. * @type: Misc res type to charge. @@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) ret = -EBUSY; goto err_charge; } + misc_cg_update_watermark(res, new_usage); } return 0; err_charge: - for (j = i; j; j = parent_misc(j)) { - atomic64_inc(&j->res[type].events); - cgroup_file_notify(&j->events_file); - } + misc_cg_event(type, i); for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); @@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) } /** + * misc_cg_peak_show() - Show the peak usage of the misc cgroup. + * @sf: Interface file + * @v: Arguments passed + * + * Context: Any context. + * Return: 0 to denote successful print. + */ +static int misc_cg_peak_show(struct seq_file *sf, void *v) +{ + int i; + u64 watermark; + struct misc_cg *cg = css_misc(seq_css(sf)); + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + watermark = atomic64_read(&cg->res[i].watermark); + if (READ_ONCE(misc_res_capacity[i]) || watermark) + seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark); + } + + return 0; +} + +/** * misc_cg_capacity_show() - Show the total capacity of misc res on the host. * @sf: Interface file * @v: Arguments passed @@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } -static int misc_events_show(struct seq_file *sf, void *v) +static int __misc_events_show(struct seq_file *sf, bool local) { struct misc_cg *cg = css_misc(seq_css(sf)); u64 events; int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic64_read(&cg->res[i].events); + if (local) + events = atomic64_read(&cg->res[i].events_local); + else + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, false); +} + +static int misc_events_local_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, true); +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_current_show, }, { + .name = "peak", + .seq_show = misc_cg_peak_show, + }, + { .name = "capacity", .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, @@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = { .file_offset = offsetof(struct misc_cg, events_file), .seq_show = misc_events_show, }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_local_file), + .seq_show = misc_events_local_show, + }, {} }; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 0e5ec7d59b4d..f5cb0ec45b9d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -38,6 +38,14 @@ #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" +enum pidcg_event { + /* Fork failed in subtree because this pids_cgroup limit was hit. */ + PIDCG_MAX, + /* Fork failed in this pids_cgroup because ancestor limit was hit. */ + PIDCG_FORKFAIL, + NR_PIDCG_EVENTS, +}; + struct pids_cgroup { struct cgroup_subsys_state css; @@ -49,11 +57,12 @@ struct pids_cgroup { atomic64_t limit; int64_t watermark; - /* Handle for "pids.events" */ + /* Handles for pids.events[.local] */ struct cgroup_file events_file; + struct cgroup_file events_local_file; - /* Number of times fork failed because limit was hit. */ - atomic64_t events_limit; + atomic64_t events[NR_PIDCG_EVENTS]; + atomic64_t events_local[NR_PIDCG_EVENTS]; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -148,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num) * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge + * @fail: storage of pid cgroup causing the fail * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ -static int pids_try_charge(struct pids_cgroup *pids, int num) +static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) { struct pids_cgroup *p, *q; @@ -166,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > limit) + if (new > limit) { + *fail = p; goto revert; - + } /* * Not technically accurate if we go over limit somewhere up * the hierarchy, but that's tolerable for the watermark. @@ -229,6 +240,36 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) } } +static void pids_event(struct pids_cgroup *pids_forking, + struct pids_cgroup *pids_over_limit) +{ + struct pids_cgroup *p = pids_forking; + bool limit = false; + + /* Only log the first time limit is hit. */ + if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(p->css.cgroup); + pr_cont("\n"); + } + cgroup_file_notify(&p->events_local_file); + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + return; + + for (; parent_pids(p); p = parent_pids(p)) { + if (p == pids_over_limit) { + limit = true; + atomic64_inc(&p->events_local[PIDCG_MAX]); + cgroup_file_notify(&p->events_local_file); + } + if (limit) + atomic64_inc(&p->events[PIDCG_MAX]); + + cgroup_file_notify(&p->events_file); + } +} + /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). @@ -236,7 +277,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; - struct pids_cgroup *pids; + struct pids_cgroup *pids, *pids_over_limit; int err; if (cset) @@ -244,16 +285,10 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) else css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - err = pids_try_charge(pids, 1); - if (err) { - /* Only log the first time events_limit is incremented. */ - if (atomic64_inc_return(&pids->events_limit) == 1) { - pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(css->cgroup); - pr_cont("\n"); - } - cgroup_file_notify(&pids->events_file); - } + err = pids_try_charge(pids, 1, &pids_over_limit); + if (err) + pids_event(pids, pids_over_limit); + return err; } @@ -337,11 +372,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css, return READ_ONCE(pids->watermark); } -static int pids_events_show(struct seq_file *sf, void *v) +static int __pids_events_show(struct seq_file *sf, bool local) { struct pids_cgroup *pids = css_pids(seq_css(sf)); + enum pidcg_event pe = PIDCG_MAX; + atomic64_t *events; + + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + pe = PIDCG_FORKFAIL; + local = true; + } + events = local ? pids->events_local : pids->events; + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe])); + return 0; +} - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); +static int pids_events_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, false); + return 0; +} + +static int pids_events_local_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, true); return 0; } @@ -368,9 +424,42 @@ static struct cftype pids_files[] = { .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events.local", + .seq_show = pids_events_local_show, + .file_offset = offsetof(struct pids_cgroup, events_local_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cftype pids_files_legacy[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = pids_peak_read, + }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; + struct cgroup_subsys pids_cgrp_subsys = { .css_alloc = pids_css_alloc, .css_free = pids_css_free, @@ -379,7 +468,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .release = pids_release, - .legacy_cftypes = pids_files, + .legacy_cftypes = pids_files_legacy, .dfl_cftypes = pids_files, .threaded = true, }; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index fb8b49437573..a06b45272411 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -594,49 +594,46 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) } } + +static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) +{ +#ifdef CONFIG_SCHED_CORE + u64 forceidle_time = bstat->forceidle_sum; + + do_div(forceidle_time, NSEC_PER_USEC); + seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); +#endif +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - struct cgroup_base_stat bstat; -#ifdef CONFIG_SCHED_CORE - u64 forceidle_time; -#endif if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); -#ifdef CONFIG_SCHED_CORE - forceidle_time = cgrp->bstat.forceidle_sum; -#endif cgroup_rstat_flush_release(cgrp); } else { - root_cgroup_cputime(&bstat); - usage = bstat.cputime.sum_exec_runtime; - utime = bstat.cputime.utime; - stime = bstat.cputime.stime; -#ifdef CONFIG_SCHED_CORE - forceidle_time = bstat.forceidle_sum; -#endif + /* cgrp->bstat of root is not actually used, reuse it */ + root_cgroup_cputime(&cgrp->bstat); + usage = cgrp->bstat.cputime.sum_exec_runtime; + utime = cgrp->bstat.cputime.utime; + stime = cgrp->bstat.cputime.stime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); -#ifdef CONFIG_SCHED_CORE - do_div(forceidle_time, NSEC_PER_USEC); -#endif seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n", usage, utime, stime); -#ifdef CONFIG_SCHED_CORE - seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); -#endif + cgroup_force_idle_show(seq, &cgrp->bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 563877d6c28b..1209ddaec026 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -483,6 +483,8 @@ static int cpu_hotplug_disabled; DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); +static bool cpu_hotplug_offline_disabled __ro_after_init; + void cpus_read_lock(void) { percpu_down_read(&cpu_hotplug_lock); @@ -542,6 +544,14 @@ static void lockdep_release_cpus_lock(void) rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } +/* Declare CPU offlining not supported */ +void cpu_hotplug_disable_offlining(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_offline_disabled = true; + cpu_maps_update_done(); +} + /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects @@ -1471,7 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. */ - if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED)) + if (cpu_hotplug_offline_disabled) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; @@ -1859,6 +1869,9 @@ static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return fals void __init bringup_nonboot_cpus(unsigned int max_cpus) { + if (!max_cpus) + return; + /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; @@ -1891,8 +1904,8 @@ int freeze_secondary_cpus(int primary) cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == primary) + for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { + if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { @@ -2446,7 +2459,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * The caller needs to hold cpus read locked while calling this function. * Return: * On success: - * Positive state number if @state is CPUHP_AP_ONLINE_DYN; + * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ @@ -2469,7 +2482,7 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); - dynstate = state == CPUHP_AP_ONLINE_DYN; + dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; @@ -2500,8 +2513,8 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, out: mutex_unlock(&cpuhp_state_mutex); /* - * If the requested state is CPUHP_AP_ONLINE_DYN, return the - * dynamically allocated state in case of success. + * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, + * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; @@ -3069,6 +3082,9 @@ EXPORT_SYMBOL(__cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); +struct cpumask __cpu_enabled_mask __read_mostly; +EXPORT_SYMBOL(__cpu_enabled_mask); + struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1273be84392c..8d57255e5b29 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> +#include <linux/uprobes.h> #include "internal.h" @@ -29,7 +30,7 @@ static inline size_t perf_callchain_entry__sizeof(void) sysctl_perf_event_max_contexts_per_stack)); } -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; @@ -176,13 +177,51 @@ put_callchain_entry(int rctx) put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } +static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry, + int start_entry_idx) +{ +#ifdef CONFIG_UPROBES + struct uprobe_task *utask = current->utask; + struct return_instance *ri; + __u64 *cur_ip, *last_ip, tramp_addr; + + if (likely(!utask || !utask->return_instances)) + return; + + cur_ip = &entry->ip[start_entry_idx]; + last_ip = &entry->ip[entry->nr - 1]; + ri = utask->return_instances; + tramp_addr = uprobe_get_trampoline_vaddr(); + + /* + * If there are pending uretprobes for the current thread, they are + * recorded in a list inside utask->return_instances; each such + * pending uretprobe replaces traced user function's return address on + * the stack, so when stack trace is captured, instead of seeing + * actual function's return address, we'll have one or many uretprobe + * trampoline addresses in the stack trace, which are not helpful and + * misleading to users. + * So here we go over the pending list of uretprobes, and each + * encountered trampoline address is replaced with actual return + * address. + */ + while (ri && cur_ip <= last_ip) { + if (*cur_ip == tramp_addr) { + *cur_ip = ri->orig_ret_vaddr; + ri = ri->next; + } + cur_ip++; + } +#endif +} + struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; - int rctx; + int rctx, start_entry_idx; entry = get_callchain_entry(&rctx); if (!entry) @@ -215,7 +254,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + start_entry_idx = entry->nr; perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0128c5ff278..ab6c4c942f79 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) state = PERF_EVENT_STATE_OFF; } - if (event->pending_sigtrap) { - bool dec = true; - - event->pending_sigtrap = 0; - if (state != PERF_EVENT_STATE_OFF && - !event->pending_work) { - event->pending_work = 1; - dec = false; - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); - task_work_add(current, &event->pending_task, TWA_RESUME); - } - if (dec) - local_dec(&event->ctx->nr_pending); - } - perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2466,7 +2451,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_irq it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2506,7 +2491,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -5206,9 +5191,35 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); +static void perf_pending_task_sync(struct perf_event *event) +{ + struct callback_head *head = &event->pending_task; + + if (!event->pending_work) + return; + /* + * If the task is queued to the current task's queue, we + * obviously can't wait for it to complete. Simply cancel it. + */ + if (task_work_cancel(current, head)) { + event->pending_work = 0; + local_dec(&event->ctx->nr_pending); + return; + } + + /* + * All accesses related to the event are within the same RCU section in + * perf_pending_task(). The RCU grace period before the event is freed + * will make sure all those accesses are complete by then. + */ + rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); +} + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); unaccount_event(event); @@ -5384,6 +5395,7 @@ int perf_event_release_kernel(struct perf_event *event) again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { + void *var = NULL; /* * Cannot change, child events are not migrated, see the @@ -5424,11 +5436,23 @@ again: * this can't be the last reference. */ put_event(event); + } else { + var = &ctx->refcount; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); + + if (var) { + /* + * If perf_event_free_task() has deleted all events from the + * ctx while the child_mutex got released above, make sure to + * notify about the preceding put_ctx(). + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); + } goto again; } mutex_unlock(&event->child_mutex); @@ -6496,6 +6520,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; + if (nr_pages > INT_MAX) + return -ENOMEM; mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -6737,7 +6763,7 @@ static void perf_sigtrap(struct perf_event *event) /* * Deliver the pending work in-event-context or follow the context. */ -static void __perf_pending_irq(struct perf_event *event) +static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); @@ -6752,11 +6778,6 @@ static void __perf_pending_irq(struct perf_event *event) * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { - if (event->pending_sigtrap) { - event->pending_sigtrap = 0; - perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); - } if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); @@ -6780,11 +6801,26 @@ static void __perf_pending_irq(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_irq() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending_irq, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) @@ -6807,8 +6843,6 @@ static void perf_pending_irq(struct irq_work *entry) perf_event_wakeup(event); } - __perf_pending_irq(event); - if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } @@ -6819,23 +6853,27 @@ static void perf_pending_task(struct callback_head *head) int rctx; /* + * All accesses to the event must belong to the same implicit RCU read-side + * critical section as the ->pending_work reset. See comment in + * perf_pending_task_sync(). + */ + rcu_read_lock(); + /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ - preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); + rcuwait_wake_up(&event->pending_work_wait); } + rcu_read_unlock(); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); - - put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS @@ -9693,16 +9731,26 @@ static int __perf_event_overflow(struct perf_event *event, */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; - if (!event->pending_sigtrap) { - event->pending_sigtrap = pending_id; + + notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; local_inc(&event->ctx->nr_pending); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without - * consuming pending_sigtrap; with exceptions: + * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. @@ -9712,13 +9760,8 @@ static int __perf_event_overflow(struct perf_event *event, * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ - WARN_ON_ONCE(event->pending_sigtrap != pending_id); + WARN_ON_ONCE(event->pending_work != pending_id); } - - event->pending_addr = 0; - if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) - event->pending_addr = data->addr; - irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -9746,11 +9789,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -9948,17 +9987,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -11948,7 +11983,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); + rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -13624,6 +13661,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; + memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5150d5f84c03..451514442a1b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -128,7 +128,7 @@ static inline unsigned long perf_data_size(struct perf_buffer *rb) static inline unsigned long perf_aux_size(struct perf_buffer *rb) { - return rb->aux_nr_pages << PAGE_SHIFT; + return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT; } #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ @@ -208,7 +208,7 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -static inline int get_recursion_context(int *recursion) +static inline int get_recursion_context(u8 *recursion) { unsigned char rctx = interrupt_context_level(); @@ -221,7 +221,7 @@ static inline int get_recursion_context(int *recursion) return rctx; } -static inline void put_recursion_context(int *recursion, int rctx) +static inline void put_recursion_context(u8 *recursion, unsigned char rctx) { barrier(); recursion[rctx]--; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4013408ce012..8cadf97bc290 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -682,13 +682,18 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!has_aux(event)) return -EOPNOTSUPP; + if (nr_pages <= 0) + return -EINVAL; + if (!overwrite) { /* * Watermark defaults to half the buffer, and so does the * max_order, to aid PMU drivers in double buffering. */ if (!watermark) - watermark = nr_pages << (PAGE_SHIFT - 1); + watermark = min_t(unsigned long, + U32_MAX, + (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* * Use aux_watermark as the basis for chunking to diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2c83ba776fc7..99be2adedbc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1474,11 +1474,20 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } +void * __weak arch_uprobe_trampoline(unsigned long *psize) +{ + static uprobe_opcode_t insn = UPROBE_SWBP_INSN; + + *psize = UPROBE_SWBP_INSN_SIZE; + return &insn; +} + static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; - uprobe_opcode_t insn = UPROBE_SWBP_INSN; + unsigned long insns_size; struct xol_area *area; + void *insns; area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) @@ -1502,7 +1511,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); + insns = arch_uprobe_trampoline(&insns_size); + arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; @@ -1827,7 +1837,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) * * Returns -1 in case the xol_area is not allocated. */ -static unsigned long get_trampoline_vaddr(void) +unsigned long uprobe_get_trampoline_vaddr(void) { struct xol_area *area; unsigned long trampoline_vaddr = -1; @@ -1878,7 +1888,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (!ri) return; - trampoline_vaddr = get_trampoline_vaddr(); + trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto fail; @@ -2123,7 +2133,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) return ri; } -static void handle_trampoline(struct pt_regs *regs) +void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; @@ -2149,6 +2159,15 @@ static void handle_trampoline(struct pt_regs *regs) instruction_pointer_set(regs, ri->orig_ret_vaddr); do { + /* pop current instance from the stack of pending return instances, + * as it's not pending anymore: we just fixed up original + * instruction pointer in regs and are about to call handlers; + * this allows fixup_uretprobe_trampoline_entries() to properly fix up + * captured stack traces from uretprobe handlers, in which pending + * trampoline addresses on the stack are replaced with correct + * original return addresses + */ + utask->return_instances = ri->next; if (valid) handle_uretprobe_chain(ri, regs); ri = free_ret_instance(ri); @@ -2187,8 +2206,8 @@ static void handle_swbp(struct pt_regs *regs) int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) - return handle_trampoline(regs); + if (bp_vaddr == uprobe_get_trampoline_vaddr()) + return uprobe_handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..be81342caf1b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -277,7 +277,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); @@ -484,6 +483,8 @@ retry: * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { @@ -832,6 +833,8 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ + seccomp_filter_release(tsk); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index 99076dbe27d8..942e3d8617bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -115,6 +115,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/task.h> +#include <kunit/visibility.h> + /* * Minimum number of threads to boot the kernel */ @@ -616,12 +618,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); - /* - * We depend on the oldmm having properly denied write access to the - * exe_file already. - */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -1334,6 +1330,7 @@ struct mm_struct *mm_alloc(void) memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } +EXPORT_SYMBOL_IF_KUNIT(mm_alloc); static inline void __mmput(struct mm_struct *mm) { @@ -1412,20 +1409,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) { - /* - * We expect the caller (i.e., sys_execve) to already denied - * write access, so this is unlikely to fail. - */ - if (unlikely(deny_write_access(new_exe_file))) - return -EACCES; + if (new_exe_file) get_file(new_exe_file); - } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } @@ -1464,9 +1452,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); - if (ret) - return -EACCES; get_file(new_exe_file); /* set the new file */ @@ -1475,10 +1460,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } @@ -2941,8 +2924,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif -#ifdef __ARCH_WANT_SYS_CLONE3 - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) @@ -3086,6 +3067,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; +#ifdef __ARCH_BROKEN_SYS_CLONE3 +#warning clone3() entry point is missing, please fix + return -ENOSYS; +#endif + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); @@ -3097,7 +3083,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) return kernel_clone(&kargs); } -#endif void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 74a4ef1da9ad..fd75b4a484d7 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 10) +#if (__GNUC__ >= 14) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 #elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 38d6ae651ac7..3d4036db15ac 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -17,6 +17,8 @@ struct irq_sim_work_ctx { unsigned int irq_count; unsigned long *pending; struct irq_domain *domain; + struct irq_sim_ops ops; + void *user_data; }; struct irq_sim_irq_ctx { @@ -88,6 +90,31 @@ static int irq_sim_set_irqchip_state(struct irq_data *data, return 0; } +static int irq_sim_request_resources(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx; + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + if (work_ctx->ops.irq_sim_irq_requested) + return work_ctx->ops.irq_sim_irq_requested(work_ctx->domain, + hwirq, + work_ctx->user_data); + + return 0; +} + +static void irq_sim_release_resources(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx; + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + if (work_ctx->ops.irq_sim_irq_released) + work_ctx->ops.irq_sim_irq_released(work_ctx->domain, hwirq, + work_ctx->user_data); +} + static struct irq_chip irq_sim_irqchip = { .name = "irq_sim", .irq_mask = irq_sim_irqmask, @@ -95,6 +122,8 @@ static struct irq_chip irq_sim_irqchip = { .irq_set_type = irq_sim_set_type, .irq_get_irqchip_state = irq_sim_get_irqchip_state, .irq_set_irqchip_state = irq_sim_set_irqchip_state, + .irq_request_resources = irq_sim_request_resources, + .irq_release_resources = irq_sim_release_resources, }; static void irq_sim_handle_irq(struct irq_work *work) @@ -164,6 +193,15 @@ static const struct irq_domain_ops irq_sim_domain_ops = { struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, unsigned int num_irqs) { + return irq_domain_create_sim_full(fwnode, num_irqs, NULL, NULL); +} +EXPORT_SYMBOL_GPL(irq_domain_create_sim); + +struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, + unsigned int num_irqs, + const struct irq_sim_ops *ops, + void *data) +{ struct irq_sim_work_ctx *work_ctx __free(kfree) = kmalloc(sizeof(*work_ctx), GFP_KERNEL); @@ -183,10 +221,14 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, work_ctx->irq_count = num_irqs; work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); work_ctx->pending = no_free_ptr(pending); + work_ctx->user_data = data; + + if (ops) + memcpy(&work_ctx->ops, ops, sizeof(*ops)); return no_free_ptr(work_ctx)->domain; } -EXPORT_SYMBOL_GPL(irq_domain_create_sim); +EXPORT_SYMBOL_GPL(irq_domain_create_sim_full); /** * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free @@ -228,10 +270,22 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev, struct fwnode_handle *fwnode, unsigned int num_irqs) { + return devm_irq_domain_create_sim_full(dev, fwnode, num_irqs, + NULL, NULL); +} +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); + +struct irq_domain * +devm_irq_domain_create_sim_full(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs, + const struct irq_sim_ops *ops, + void *data) +{ struct irq_domain *domain; int ret; - domain = irq_domain_create_sim(fwnode, num_irqs); + domain = irq_domain_create_sim_full(fwnode, num_irqs, ops, data); if (IS_ERR(domain)) return domain; @@ -241,4 +295,4 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev, return domain; } -EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim_full); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 71b0fc2d0aea..dd53298ef1a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1337,7 +1337,7 @@ static int irq_thread(void *data) * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ - task_work_cancel(current, irq_thread_dtor); + task_work_cancel_func(current, irq_thread_dtor); return 0; } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3218fa5688b9..4ad5ed8adf96 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -131,13 +131,16 @@ bool static_key_fast_inc_not_disabled(struct static_key *key) STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends - * static_key_slow_inc() down the slow path, and it is non-zero - * so it counts as "enabled" in jump_label_update(). Note that - * atomic_inc_unless_negative() checks >= 0, so roll our own. + * static_key_slow_inc/dec() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). + * + * The INT_MAX overflow condition is either used by the networking + * code to reset or detected in the slow path of + * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { - if (v <= 0 || (v + 1) < 0) + if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); @@ -150,7 +153,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) lockdep_assert_cpus_held(); /* - * Careful if we get concurrent static_key_slow_inc() calls; + * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see @@ -159,22 +162,24 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) if (static_key_fast_inc_not_disabled(key)) return true; - jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - atomic_set(&key->enabled, -1); + guard(mutex)(&jump_label_mutex); + /* Try to mark it as 'enabling in progress. */ + if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* - * Ensure that if the above cmpxchg loop observes our positive - * value, it must also observe all the text changes. + * Ensure that when static_key_fast_inc_not_disabled() or + * static_key_slow_try_dec() observe the positive value, + * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { - if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) { - jump_label_unlock(); + /* + * While holding the mutex this should never observe + * anything else than a value >= 1 and succeed + */ + if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; - } } - jump_label_unlock(); return true; } @@ -247,20 +252,32 @@ EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_slow_try_dec(struct static_key *key) { - int val; - - val = atomic_fetch_add_unless(&key->enabled, -1, 1); - if (val == 1) - return false; + int v; /* - * The negative count check is valid even when a negative - * key->enabled is in use by static_key_slow_inc(); a - * __static_key_slow_dec() before the first static_key_slow_inc() - * returns is unbalanced, because all other static_key_slow_inc() - * instances block while the update is in progress. + * Go into the slow path if key::enabled is less than or equal than + * one. One is valid to shut down the key, anything less than one + * is an imbalance, which is handled at the call site. + * + * That includes the special case of '-1' which is set in + * static_key_slow_inc_cpuslocked(), but that's harmless as it is + * fully serialized in the slow path below. By the time this task + * acquires the jump label lock the value is back to one and the + * retry under the lock must succeed. */ - WARN(val < 0, "jump label: negative count!\n"); + v = atomic_read(&key->enabled); + do { + /* + * Warn about the '-1' case though; since that means a + * decrement is concurrent with a first (0->1) increment. IOW + * people are trying to disable something that wasn't yet fully + * enabled. This suggests an ordering problem on the user side. + */ + WARN_ON_ONCE(v < 0); + if (v <= 1) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); + return true; } @@ -271,10 +288,11 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key) if (static_key_slow_try_dec(key)) return; - jump_label_lock(); - if (atomic_dec_and_test(&key->enabled)) + guard(mutex)(&jump_label_mutex); + if (atomic_cmpxchg(&key->enabled, 1, 0)) jump_label_update(key); - jump_label_unlock(); + else + WARN_ON_ONCE(!static_key_slow_try_dec(key)); } static void __static_key_slow_dec(struct static_key *key) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 22ea19a36e6e..98b9622d372e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -388,12 +388,12 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -static const char *kallsyms_lookup_buildid(unsigned long addr, +static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - const char *ret; + int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -410,7 +410,7 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = namebuf; + ret = strlen(namebuf); goto found; } @@ -442,8 +442,13 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, - NULL, namebuf); + int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, + NULL, namebuf); + + if (!ret) + return NULL; + + return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -478,19 +483,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, { char *modname; const unsigned char *buildid; - const char *name; unsigned long offset, size; int len; address += symbol_offset; - name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, + len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); - if (!name) + if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); - if (name != buffer) - strcpy(buffer, name); - len = strlen(buffer); offset -= symbol_offset; if (add_offset) diff --git a/kernel/kcov.c b/kernel/kcov.c index c3124f6d5536..f0a69d402066 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -632,6 +632,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov->mode = mode; t->kcov = kcov; + t->kcov_mode = KCOV_MODE_REMOTE; kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0c17b4c83e1c..117d9d4d3c3b 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1620,5 +1620,6 @@ static struct kunit_suite kcsan_test_suite = { kunit_test_suites(&kcsan_test_suite); +MODULE_DESCRIPTION("KCSAN test suite"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 151bd3de5936..726b22ce7d0b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -97,7 +97,6 @@ static struct ctl_table kern_lockdep_table[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_LOCK_STAT */ - { } }; static __init int kernel_lockdep_sysctls_init(void) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 415d81e6ce70..de95ec07e477 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -30,6 +30,7 @@ #include <linux/torture.h> #include <linux/reboot.h> +MODULE_DESCRIPTION("torture test facility for locking"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1df5fef8a656..7d96bed718e4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -583,7 +583,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" -bool nopvspin __initdata; +bool nopvspin; static __init int parse_nopvspin(char *arg) { nopvspin = true; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c6d17aee4209..33cac79e3994 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1297,7 +1297,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline int __down_write_common(struct rw_semaphore *sem, int state) +static __always_inline int __down_write_common(struct rw_semaphore *sem, int state) { int ret = 0; @@ -1310,12 +1310,12 @@ static inline int __down_write_common(struct rw_semaphore *sem, int state) return ret; } -static inline void __down_write(struct rw_semaphore *sem) +static __always_inline void __down_write(struct rw_semaphore *sem) { __down_write_common(sem, TASK_UNINTERRUPTIBLE); } -static inline int __down_write_killable(struct rw_semaphore *sem) +static __always_inline int __down_write_killable(struct rw_semaphore *sem) { return __down_write_common(sem, TASK_KILLABLE); } diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 8475a0794f8c..438c6086d540 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr) && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT) +void notrace lockdep_assert_in_softirq_func(void) +{ + lockdep_assert_in_softirq(); +} +EXPORT_SYMBOL(lockdep_assert_in_softirq_func); +#endif diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 62fb57bb9f16..bf65e0c3c86f 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -321,14 +321,15 @@ void * __weak dereference_module_function_descriptor(struct module *mod, * For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) +int module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) { - const char *ret = NULL; + const char *sym; + int ret = 0; struct module *mod; preempt_disable(); @@ -344,12 +345,10 @@ const char *module_address_lookup(unsigned long addr, #endif } - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strscpy(namebuf, ret, KSYM_NAME_LEN); - ret = namebuf; + sym = find_kallsyms_symbol(mod, addr, size, offset); + + if (sym) + ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } preempt_enable(); diff --git a/kernel/module/main.c b/kernel/module/main.c index d18a94b973e1..d9592195c5bb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2166,6 +2166,8 @@ static int find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); + mod->btf_base_data = any_section_objs(info, ".BTF.base", 1, + &mod->btf_base_data_size); #endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", @@ -2590,8 +2592,9 @@ static noinline int do_init_module(struct module *mod) } #ifdef CONFIG_DEBUG_INFO_BTF_MODULES - /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointers */ mod->btf_data = NULL; + mod->btf_base_data = NULL; #endif /* * We want to free module_init, but be aware that kallsyms may be diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index dc48fecfa1dc..bdf0087d6442 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -218,6 +218,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ do { clear_thread_flag(TIF_SIGPENDING); + clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); @@ -248,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; - /* - * Release tasks_rcu_exit_srcu to avoid following deadlock: - * - * 1) TASK A unshare(CLONE_NEWPID) - * 2) TASK A fork() twice -> TASK B (child reaper for new ns) - * and TASK C - * 3) TASK B exits, kills TASK C, waits for TASK A to reap it - * 4) TASK A calls synchronize_rcu_tasks() - * -> synchronize_srcu(tasks_rcu_exit_srcu) - * 5) *DEADLOCK* - * - * It is considered safe to release tasks_rcu_exit_srcu here - * because we assume the current task can not be concurrently - * reaped at this point. - */ - exit_tasks_rcu_stop(); schedule(); - exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 040fe7d1eda2..39a2b61c7232 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y = printk.o conopt.o +obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c deleted file mode 100644 index 9d507bac3657..000000000000 --- a/kernel/printk/conopt.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Kernel command line console options for hardware based addressing - * - * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ - * Author: Tony Lindgren <tony@atomide.com> - */ - -#include <linux/console.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/errno.h> - -#include "console_cmdline.h" - -/* - * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0 - * in addition to the legacy ttyS0 style naming. - */ -#define CONSOLE_NAME_MAX 32 - -#define CONSOLE_OPT_MAX 16 -#define CONSOLE_BRL_OPT_MAX 16 - -struct console_option { - char name[CONSOLE_NAME_MAX]; - char opt[CONSOLE_OPT_MAX]; - char brl_opt[CONSOLE_BRL_OPT_MAX]; - u8 has_brl_opt:1; -}; - -/* Updated only at console_setup() time, no locking needed */ -static struct console_option conopt[MAX_CMDLINECONSOLES]; - -/** - * console_opt_save - Saves kernel command line console option for driver use - * @str: Kernel command line console name and option - * @brl_opt: Braille console options - * - * Saves a kernel command line console option for driver subsystems to use for - * adding a preferred console during init. Called from console_setup() only. - * - * Return: 0 on success, negative error code on failure. - */ -int __init console_opt_save(const char *str, const char *brl_opt) -{ - struct console_option *con; - size_t namelen, optlen; - const char *opt; - int i; - - namelen = strcspn(str, ","); - if (namelen == 0 || namelen >= CONSOLE_NAME_MAX) - return -EINVAL; - - opt = str + namelen; - if (*opt == ',') - opt++; - - optlen = strlen(opt); - if (optlen >= CONSOLE_OPT_MAX) - return -EINVAL; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - - if (con->name[0]) { - if (!strncmp(str, con->name, namelen)) - return 0; - continue; - } - - /* - * The name isn't terminated, only opt is. Empty opt is fine, - * but brl_opt can be either empty or NULL. For more info, see - * _braille_console_setup(). - */ - strscpy(con->name, str, namelen + 1); - strscpy(con->opt, opt, CONSOLE_OPT_MAX); - if (brl_opt) { - strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX); - con->has_brl_opt = 1; - } - - return 0; - } - - return -ENOMEM; -} - -static struct console_option *console_opt_find(const char *name) -{ - struct console_option *con; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - if (!strcmp(name, con->name)) - return con; - } - - return NULL; -} - -/** - * add_preferred_console_match - Adds a preferred console if a match is found - * @match: Expected console on kernel command line, such as console=DEVNAME:0.0 - * @name: Name of the console character device to add such as ttyS - * @idx: Index for the console - * - * Allows driver subsystems to add a console after translating the command - * line name to the character device name used for the console. Options are - * added automatically based on the kernel command line. Duplicate preferred - * consoles are ignored by __add_preferred_console(). - * - * Return: 0 on success, negative error code on failure. - */ -int add_preferred_console_match(const char *match, const char *name, - const short idx) -{ - struct console_option *con; - char *brl_opt = NULL; - - if (!match || !strlen(match) || !name || !strlen(name) || - idx < 0) - return -EINVAL; - - con = console_opt_find(match); - if (!con) - return -ENOENT; - - /* - * See __add_preferred_console(). It checks for NULL brl_options to set - * the preferred_console flag. Empty brl_opt instead of NULL leads into - * the preferred_console flag not set, and CON_CONSDEV not being set, - * and the boot console won't get disabled at the end of console_setup(). - */ - if (con->has_brl_opt) - brl_opt = con->brl_opt; - - console_opt_add_preferred_console(name, idx, con->opt, brl_opt); - - return 0; -} diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index a125e0235589..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -2,12 +2,6 @@ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H -#define MAX_CMDLINECONSOLES 8 - -int console_opt_save(const char *str, const char *brl_opt); -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options); - struct console_cmdline { char name[16]; /* Name of the driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 420fd310129d..dddb15f48d59 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -383,6 +383,9 @@ static int console_locked; /* * Array of consoles built from command line options (console=) */ + +#define MAX_CMDLINECONSOLES 8 + static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; @@ -2500,17 +2503,6 @@ static int __init console_setup(char *str) if (_braille_console_setup(&str, &brl_options)) return 1; - /* Save the console for driver subsystem use */ - if (console_opt_save(str, brl_options)) - return 1; - - /* Flag register_console() to not call try_enable_default_console() */ - console_set_on_cmdline = 1; - - /* Don't attempt to parse a DEVNAME:0.0 style console */ - if (strchr(str, ':')) - return 1; - /* * Decode str into name, index, options. */ @@ -2541,13 +2533,6 @@ static int __init console_setup(char *str) } __setup("console=", console_setup); -/* Only called from add_preferred_console_match() */ -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options) -{ - return __add_preferred_console(name, idx, options, brl_options, true); -} - /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name @@ -3522,7 +3507,7 @@ void register_console(struct console *newcon) * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ - if (preferred_console < 0 && !console_set_on_cmdline) { + if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 8db4fedaaa1e..b53a9e8f5904 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -42,6 +42,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 807fbf6123a7..08bf7c669dd3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); @@ -390,6 +391,7 @@ struct rcu_torture_ops { int extendables; int slow_gps; int no_pi_lock; + int debug_objects; const char *name; }; @@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = { .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, .name = "rcu" }; @@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcu" }; @@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcud" }; @@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); rfcpp = rfp->rcu_fwd_cb_tail; rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); + smp_store_release(rfcpp, rfcp); WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(rfp->n_launders_hist)) @@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void) cur_ops->gp_slow_unregister(NULL); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu_hurry(&rh2, rcu_torture_leak_cb); - call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu_hurry(rhp, rcu_torture_leak_cb); - call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ } - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); kfree(rhp); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } static void rcutorture_sync(void) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 2c2648a3ad30..f4ea5b1ec068 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -63,6 +63,7 @@ do { \ #define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5afd5cf494db..549c03336ee9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bc4b58b0204e..b24db425f16d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); - return; /* Caller forgot to stop doing call_srcu()? */ + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? } kfree(sup->node); sup->node = NULL; @@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp) bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; @@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ @@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + if (cookie != SRCU_GET_STATE_COMPLETED && + !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 6c2bd9001adc..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * we are called at early boot time but this shouldn't happen. */ } - WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1); + rsp->gp_count++; spin_unlock_irq(&rsp->rss_lock); if (gp_state == GP_IDLE) { @@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp) */ void rcu_sync_exit(struct rcu_sync *rsp) { - int gpc; - WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); - WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); - gpc = rsp->gp_count - 1; - WRITE_ONCE(rsp->gp_count, gpc); - if (!gpc) { + WARN_ON_ONCE(rsp->gp_count == 0); + if (!--rsp->gp_count) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); rcu_sync_call(rsp); @@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int gp_state; - WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count); if (rsp->gp_state == GP_REPLAY) WRITE_ONCE(rsp->gp_state, GP_EXIT); gp_state = rsp->gp_state; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e1bf33018e6d..ba3440a45b6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // not know to synchronize with this RCU Tasks grace period) have // completed exiting. The synchronize_rcu() in rcu_tasks_postgp() // will take care of any tasks stuck in the non-preemptible region -// of do_exit() following its call to exit_tasks_rcu_stop(). +// of do_exit() following its call to exit_tasks_rcu_finish(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void) * Remove the task from the "yet another list" because do_exit() is now * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) +void exit_tasks_rcu_finish(void) { unsigned long flags; struct rcu_tasks_percpu *rtpcp; @@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -/* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. - */ -void exit_tasks_rcu_finish(void) -{ - exit_tasks_rcu_stop(); - exit_tasks_rcu_finish_trace(current); + exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_stop(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. t = cpu_curr_snapshot(cpu); if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 28c7031711a3..e641cc681901 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = { .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -175,6 +176,9 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. static int rcu_unlock_delay; @@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void) } /* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. - */ -static int rcu_dynticks_snap(int cpu) -{ - smp_mb(); // Fundamental RCU ordering guarantee. - return ct_dynticks_cpu_acquire(cpu); -} - -/* * Return true if the snapshot returned from rcu_dynticks_snap() * indicates that RCU is in an extended quiescent state. */ @@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp->cpu); + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. + */ + return snap != ct_dynticks_cpu_acquire(rdp->cpu); } /* @@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); + /* + * Full ordering between remote CPU's post idle accesses and updater's + * accesses prior to current GP (and also the started GP sequence number) + * is enforced by rcu_seq_start() implicit barrier and even further by + * smp_mb__after_unlock_lock() barriers chained all the way throughout the + * rnp locking tree since rcu_gp_init() and up to the current leaf rnp + * locking. + * + * Ordering between remote CPU's pre idle accesses and post grace period + * updater's accesses is enforced by the below acquire semantic. + */ + rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); @@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) rcu_sr_put_wait_head(rcu); } + + /* Order list manipulations with atomic access. */ + atomic_dec_return_release(&rcu_state.srs_cleanups_pending); } /* @@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail, *next, *rcu; + struct llist_node *wait_tail, *next = NULL, *rcu = NULL; int done = 0; wait_tail = rcu_state.srs_wait_tail; @@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void) break; } - // concurrent sr_normal_gp_cleanup work might observe this update. - smp_store_release(&rcu_state.srs_done_tail, wait_tail); + /* + * Fast path, no more users to process except putting the second last + * wait head if no inflight-workers. If there are in-flight workers, + * they will remove the last wait head. + * + * Note that the ACQUIRE orders atomic access with list manipulation. + */ + if (wait_tail->next && wait_tail->next->next == NULL && + rcu_sr_is_wait_head(wait_tail->next) && + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) { + rcu_sr_put_wait_head(wait_tail->next); + wait_tail->next = NULL; + } + + /* Concurrent sr_normal_gp_cleanup work might observe this update. */ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + smp_store_release(&rcu_state.srs_done_tail, wait_tail); /* * We schedule a work in order to perform a final processing * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - queue_work(sync_wq, &rcu_state.srs_cleanup_work); + if (wait_tail->next) { + atomic_inc(&rcu_state.srs_cleanups_pending); + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) + atomic_dec(&rcu_state.srs_cleanups_pending); + } } /* @@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - local_irq_save(flags); + local_irq_disable(); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void) /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); continue; } @@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void) raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -4313,11 +4347,15 @@ static int rcu_pending(int user) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ - if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - gp_in_progress = rcu_gp_in_progress(); if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; @@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; @@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_rdp_is_offloaded(rdp) || - rcu_segcblist_empty(&rdp->cblist)) - return; /* No callbacks to migrate. */ + if (rcu_rdp_is_offloaded(rdp)) + return; raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + return; /* No callbacks to migrate. */ + } + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bae7925c497f..fcf2b4aa3441 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -223,7 +223,6 @@ struct rcu_data { struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ - atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ @@ -420,6 +419,7 @@ struct rcu_state { struct llist_node *srs_done_tail; /* ready for GP users. */ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8a1d9c8bd9f7..4acd29d16fdb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - smp_mb(); /* Ensure test happens before caller kfree(). */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). + */ + smp_mb(); return true; } return false; @@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(cpu); + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_dynticks_cpu_acquire(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void) rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3f85577bddd4..3ce30841119a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. + * lock isn't immediately available, perform minimal sanity check. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) __acquires(&rdp->nocb_bypass_lock) @@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; - atomic_inc(&rdp->nocb_lock_contended); + /* + * Contention expected only when local enqueue collide with + * remote flush from kthreads. + */ WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); } /* @@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } // We need to use the bypass. - rcu_nocb_wait_contended(rdp); rcu_nocb_bypass_lock(rdp); ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ @@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp, - bool *wake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; @@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will handle this rdp until it ever gets de-offloaded. */ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 1; } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { @@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will ignore this rdp until it ever gets re-offloaded. */ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 0; } else { WARN_ON_ONCE(1); @@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - bool wake_state = false; int ret; - ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + ret = nocb_gp_toggle_rdp(rdp_toggling); if (ret == 1) list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); else if (ret == 0) list_del(&rdp_toggling->nocb_entry_rdp); - if (wake_state) - swake_up_one(&rdp_toggling->nocb_state_wq); + + swake_up_one(&rdp_toggling->nocb_state_wq); } my_rdp->nocb_gp_seq = -1; @@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) { - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); + return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park(); } /* @@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp) struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_state = false; bool needwake_gp = false; - bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - if (READ_ONCE(rdp->nocb_cb_sleep)) { - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + if (kthread_should_park()) { + kthread_parkme(); + } else if (READ_ONCE(rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; + if (!rcu_segcblist_ready_cbs(cblist)) { + WRITE_ONCE(rdp->nocb_cb_sleep, true); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; + WRITE_ONCE(rdp->nocb_cb_sleep, false); } - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } /* @@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, bool wake_gp = false; rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; rcu_nocb_unlock_irqrestore(rdp, flags); - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); @@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - /* - * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. - * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. - */ - if (!rdp->nocb_cb_kthread) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_GP)); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list @@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) * but we stick to paranoia in this rare path. */ rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_nocb_unlock_irqrestore(rdp, flags); list_del(&rdp->nocb_entry_rdp); @@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg) wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + + kthread_unpark(rdp->nocb_cb_kthread); + swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* @@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); @@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); + t = kthread_create(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; + if (rcu_rdp_is_offloaded(rdp)) + wake_up_process(t); + else + kthread_park(t); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], "lL"[raw_spin_is_locked(&rdp->nocb_lock)], "sS"[!!rdp->nocb_cb_sleep], ".W"[swait_active(&rdp->nocb_cb_wq)], diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 340bbefe5f65..c569da65b421 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || rcu_lockdep_is_held_nocb(rdp) || - (rdp == this_cpu_ptr(&rcu_data) && - !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); @@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); + if (nohz_full_patience_delay < 0) { + pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay); + nohz_full_patience_delay = 0; + } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) { + pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 460efecd077b..4b0e9d7c4c68 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xffff, + ct_dynticks_cpu(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 59032aaccd18..44e83a646264 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -43,6 +43,7 @@ #define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x) +MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); @@ -67,7 +68,7 @@ torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operation torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); -char *torture_type = ""; +static char *torture_type = ""; #ifdef MODULE # define SCFTORT_SHUTDOWN 0 diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab3773f..39c315182b35 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -52,3 +52,4 @@ #include "cputime.c" #include "deadline.c" +#include "syscalls.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3c6193de9cde..a09655b48140 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -340,7 +340,7 @@ again: this_clock = sched_clock_local(my_scd); /* * We must enforce atomic readout on 32-bit, otherwise the - * update on the remote CPU can hit inbetween the readout of + * update on the remote CPU can hit in between the readout of * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); @@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void) } /* - * We are going deep-idle (irqs are disabled): + * We are going deep-idle (IRQs are disabled): */ notrace void sched_clock_idle_sleep_event(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bcf2c4cc0522..ae5ef3013a55 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2,9 +2,10 @@ /* * kernel/sched/core.c * - * Core kernel scheduler code and related syscalls + * Core kernel CPU scheduler code * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ #include <linux/highmem.h> #include <linux/hrtimer_api.h> @@ -706,14 +707,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) /* * Since irq_time is only updated on {soft,}irq_exit, we might run into * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. + * {soft,}IRQ region. * * When this happens, we stop ->clock_task and only update the * prev_irq_time stamp to account for the part that fit, so that a next * update will consume the rest. This ensures ->clock_task is * monotonic. * - * It does however cause some slight miss-attribution of {soft,}irq + * It does however cause some slight miss-attribution of {soft,}IRQ * time, a more accurate solution would be to update the irq_time using * the current rq->clock timestamp, except that would require using * atomic ops. @@ -723,7 +724,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING @@ -826,7 +826,7 @@ static void __hrtick_start(void *arg) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { @@ -850,7 +850,7 @@ void hrtick_start(struct rq *rq, u64 delay) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { @@ -884,7 +884,7 @@ static inline void hrtick_rq_init(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ /* - * cmpxchg based fetch_or, macro so it works for different integer types + * try_cmpxchg based fetch_or() macro so it works for different integer types: */ #define fetch_or(ptr, mask) \ ({ \ @@ -1081,7 +1081,7 @@ void resched_cpu(int cpu) * * We don't do similar optimization for completely idle system, as * selecting an idle CPU will add more delays to the timers than intended - * (as that CPU's timer base may not be uptodate wrt jiffies etc). + * (as that CPU's timer base may not be up to date wrt jiffies etc). */ int get_nohz_timer_target(void) { @@ -1141,7 +1141,7 @@ static void wake_up_idle_cpu(int cpu) * nohz functions that would need to follow TIF_NR_POLLING * clearing: * - * - On most archs, a simple fetch_or on ti::flags with a + * - On most architectures, a simple fetch_or on ti::flags with a * "0" value would be enough to know if an IPI needs to be sent. * * - x86 needs to perform a last need_resched() check between @@ -1324,30 +1324,27 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; - struct load_weight *load = &p->se.load; + struct load_weight lw; - /* - * SCHED_IDLE tasks get minimal weight: - */ if (task_has_idle_policy(p)) { - load->weight = scale_load(WEIGHT_IDLEPRIO); - load->inv_weight = WMULT_IDLEPRIO; - return; + lw.weight = scale_load(WEIGHT_IDLEPRIO); + lw.inv_weight = WMULT_IDLEPRIO; + } else { + lw.weight = scale_load(sched_prio_to_weight[prio]); + lw.inv_weight = sched_prio_to_wmult[prio]; } /* * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; - } + if (update_load && p->sched_class == &fair_sched_class) + reweight_task(p, &lw); + else + p->se.load = lw; } #ifdef CONFIG_UCLAMP_TASK @@ -1384,7 +1381,7 @@ static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY * This knob will not override the system default sched_util_clamp_min defined * above. */ -static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1409,32 +1406,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) - -#define for_each_clamp_id(clamp_id) \ - for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) - -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); -} - -static inline unsigned int uclamp_none(enum uclamp_id clamp_id) -{ - if (clamp_id == UCLAMP_MIN) - return 0; - return SCHED_CAPACITY_SCALE; -} - -static inline void uclamp_se_set(struct uclamp_se *uc_se, - unsigned int value, bool user_defined) -{ - uc_se->value = value; - uc_se->bucket_id = uclamp_bucket_id(value); - uc_se->user_defined = user_defined; -} - static inline unsigned int uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) @@ -1676,7 +1647,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fixup the expected value. + * e.g. due to future modification, warn and fix up the expected value. */ SCHED_WARN_ON(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { @@ -1898,107 +1869,6 @@ undo: } #endif -static int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - int util_min = p->uclamp_req[UCLAMP_MIN].value; - int util_max = p->uclamp_req[UCLAMP_MAX].value; - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { - util_min = attr->sched_util_min; - - if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) - return -EINVAL; - } - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { - util_max = attr->sched_util_max; - - if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) - return -EINVAL; - } - - if (util_min != -1 && util_max != -1 && util_min > util_max) - return -EINVAL; - - /* - * We have valid uclamp attributes; make sure uclamp is enabled. - * - * We need to do that here, because enabling static branches is a - * blocking operation which obviously cannot be done while holding - * scheduler locks. - */ - static_branch_enable(&sched_uclamp_used); - - return 0; -} - -static bool uclamp_reset(const struct sched_attr *attr, - enum uclamp_id clamp_id, - struct uclamp_se *uc_se) -{ - /* Reset on sched class change for a non user-defined clamp value. */ - if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && - !uc_se->user_defined) - return true; - - /* Reset on sched_util_{min,max} == -1. */ - if (clamp_id == UCLAMP_MIN && - attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && - attr->sched_util_min == -1) { - return true; - } - - if (clamp_id == UCLAMP_MAX && - attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && - attr->sched_util_max == -1) { - return true; - } - - return false; -} - -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) -{ - enum uclamp_id clamp_id; - - for_each_clamp_id(clamp_id) { - struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int value; - - if (!uclamp_reset(attr, clamp_id, uc_se)) - continue; - - /* - * RT by default have a 100% boost value that could be modified - * at runtime. - */ - if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - value = sysctl_sched_uclamp_util_min_rt_default; - else - value = uclamp_none(clamp_id); - - uclamp_se_set(uc_se, value, false); - - } - - if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) - return; - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && - attr->sched_util_min != -1) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], - attr->sched_util_min, true); - } - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && - attr->sched_util_max != -1) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], - attr->sched_util_max, true); - } -} - static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; @@ -2066,13 +1936,6 @@ static void __init init_uclamp(void) #else /* !CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } -static inline int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - return -EOPNOTSUPP; -} -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } @@ -2102,7 +1965,7 @@ unsigned long get_wchan(struct task_struct *p) return ip; } -static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); @@ -2119,7 +1982,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) sched_core_enqueue(rq, p); } -static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) sched_core_dequeue(rq, p, flags); @@ -2157,52 +2020,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -static inline int __normal_prio(int policy, int rt_prio, int nice) -{ - int prio; - - if (dl_policy(policy)) - prio = MAX_DL_PRIO - 1; - else if (rt_policy(policy)) - prio = MAX_RT_PRIO - 1 - rt_prio; - else - prio = NICE_TO_PRIO(nice); - - return prio; -} - -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -2221,9 +2038,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -2392,9 +2209,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); -static int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx); - static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { struct affinity_context ac = { @@ -2409,7 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) return; /* - * Violates locking rules! see comment in __do_set_cpus_allowed(). + * Violates locking rules! See comment in __do_set_cpus_allowed(). */ __do_set_cpus_allowed(p, &ac); } @@ -2576,7 +2390,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, } /* - * migration_cpu_stop - this will be executed by a highprio stopper thread + * migration_cpu_stop - this will be executed by a high-prio stopper thread * and performs thread migration by bumping thread off CPU then * 'pushing' onto another runqueue. */ @@ -2821,16 +2635,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); } -static cpumask_t *alloc_user_cpus_ptr(int node) -{ - /* - * See do_set_cpus_allowed() above for the rcu_head usage. - */ - int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); - - return kmalloc_node(size, GFP_KERNEL, node); -} - int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { @@ -3199,8 +3003,7 @@ out: * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) +int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx) { struct rq_flags rf; struct rq *rq; @@ -3319,9 +3122,6 @@ out_free_mask: free_cpumask_var(new_mask); } -static int -__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); - /* * Restore the affinity of a task @p which was previously restricted by a * call to force_compatible_cpus_allowed_ptr(). @@ -3701,12 +3501,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else /* CONFIG_SMP */ -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } static inline bool rq_has_pinned_tasks(struct rq *rq) @@ -3714,11 +3508,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - #endif /* !CONFIG_SMP */ static void @@ -3901,8 +3690,8 @@ void sched_ttwu_pending(void *arg) * it is possible for select_idle_siblings() to stack a number * of tasks on this CPU during that window. * - * It is ok to clear ttwu_pending when another task pending. - * We will receive IPI after local irq enabled and then enqueue it. + * It is OK to clear ttwu_pending when another task pending. + * We will receive IPI after local IRQ enabled and then enqueue it. * Since now nr_running > 0, idle_cpu() will always get correct result. */ WRITE_ONCE(rq->ttwu_pending, 0); @@ -4467,12 +4256,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) * @cpu: The CPU on which to snapshot the task. * * Returns the task_struct pointer of the task "currently" running on - * the specified CPU. If the same task is running on that CPU throughout, - * the return value will be a pointer to that task's task_struct structure. - * If the CPU did any context switches even vaguely concurrently with the - * execution of this function, the return value will be a pointer to the - * task_struct structure of a randomly chosen task that was running on - * that CPU somewhere around the time that this function was executing. + * the specified CPU. * * If the specified CPU was offline, the return value is whatever it * is, perhaps a pointer to the task_struct structure of that CPU's idle @@ -4486,11 +4270,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) */ struct task_struct *cpu_curr_snapshot(int cpu) { + struct rq *rq = cpu_rq(cpu); struct task_struct *t; + struct rq_flags rf; - smp_mb(); /* Pairing determined by caller's synchronization design. */ + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; } @@ -5095,7 +4884,7 @@ __splice_balance_callbacks(struct rq *rq, bool split) return head; } -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +struct balance_callback *splice_balance_callbacks(struct rq *rq) { return __splice_balance_callbacks(rq, true); } @@ -5105,7 +4894,7 @@ static void __balance_callbacks(struct rq *rq) do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +void balance_callbacks(struct rq *rq, struct balance_callback *head) { unsigned long flags; @@ -5122,15 +4911,6 @@ static inline void __balance_callbacks(struct rq *rq) { } -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - #endif static inline void @@ -5233,7 +5013,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * * The context switch have flipped the stack from under us and restored the * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq + * past. 'prev == current' is still correct but we need to recalculate this_rq * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) @@ -5556,9 +5336,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); static inline void prefetch_curr_exec_start(struct task_struct *p) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *curr = (&p->se)->cfs_rq->curr; + struct sched_entity *curr = p->se.cfs_rq->curr; #else - struct sched_entity *curr = (&task_rq(p)->cfs)->curr; + struct sched_entity *curr = task_rq(p)->cfs.curr; #endif prefetch(curr); prefetch(&curr->exec_start); @@ -5579,7 +5359,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. + * Reading ->on_cpu is racy, but this is OK. * * If we race with it leaving CPU, we'll take a lock. So we're correct. * If we race with it entering CPU, unaccounted time is 0. This is @@ -5665,7 +5445,7 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; + struct task_struct *curr; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5677,6 +5457,9 @@ void sched_tick(void) rq_lock(rq, &rf); + curr = rq->curr; + psi_account_irqtime(rq, curr, NULL); + update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); @@ -6737,6 +6520,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) ++*switch_count; migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); @@ -6853,7 +6637,7 @@ void __sched schedule_idle(void) { /* * As this skips calling sched_submit_work(), which the idle task does - * regardless because that function is a nop when the task is in a + * regardless because that function is a NOP when the task is in a * TASK_RUNNING state, make sure this isn't used someplace that the * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. @@ -7048,9 +6832,9 @@ EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); /* * This is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * off of IRQ context. + * Note, that this is called and return with IRQs disabled. This will + * protect us against recursive calling from IRQ contexts. */ asmlinkage __visible void __sched preempt_schedule_irq(void) { @@ -7080,7 +6864,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; @@ -7120,21 +6904,6 @@ void rt_mutex_post_schedule(void) lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); } -static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -{ - if (pi_task) - prio = min(prio, pi_task->prio); - - return prio; -} - -static inline int rt_effective_prio(struct task_struct *p, int prio) -{ - struct task_struct *pi_task = rt_mutex_get_top_task(p); - - return __rt_effective_prio(pi_task, prio); -} - /* * rt_mutex_setprio - set the current priority of a task * @p: task to boost @@ -7184,7 +6953,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) goto out_unlock; /* - * Idle task boosting is a nono in general. There is one + * Idle task boosting is a no-no in general. There is one * exception, when PREEMPT_RT and NOHZ is active: * * The idle task calls get_next_timer_interrupt() and holds @@ -7263,1325 +7032,8 @@ out_unlock: preempt_enable(); } -#else -static inline int rt_effective_prio(struct task_struct *p, int prio) -{ - return prio; -} #endif -void set_user_nice(struct task_struct *p, long nice) -{ - bool queued, running; - struct rq *rq; - int old_prio; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. - */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); - - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it won't have any effect on scheduling until the task is - * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: - */ - if (task_has_dl_policy(p) || task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - return; - } - - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * is_nice_reduction - check if nice value is an actual reduction - * - * Similar to can_nice() but does not perform a capability check. - * - * @p: task - * @nice: nice value - */ -static bool is_nice_reduction(const struct task_struct *p, const int nice) -{ - /* Convert nice value [19,-20] to rlimit style value [1,40]: */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); -} - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * - * sched policy return value kernel prio user prio/nice - * - * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] - * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] - * deadline -101 -1 0 - */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * idle_cpu - is a given CPU idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) - return 0; - -#ifdef CONFIG_SMP - if (rq->ttwu_pending) - return 0; -#endif - - return 1; -} - -/** - * available_idle_cpu - is a given CPU idle for enqueuing work. - * @cpu: the CPU in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int available_idle_cpu(int cpu) -{ - if (!idle_cpu(cpu)) - return 0; - - if (vcpu_is_preempted(cpu)) - return 0; - - return 1; -} - -/** - * idle_task - return the idle task for a given CPU. - * @cpu: the processor in question. - * - * Return: The idle task for the CPU @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -#ifdef CONFIG_SCHED_CORE -int sched_core_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (sched_core_enabled(rq) && rq->curr == rq->idle) - return 1; - - return idle_cpu(cpu); -} - -#endif - -#ifdef CONFIG_SMP -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max) -{ - unsigned long util, irq, scale; - struct rq *rq = cpu_rq(cpu); - - scale = arch_scale_cpu_capacity(cpu); - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= scale)) { - if (min) - *min = scale; - if (max) - *max = scale; - return scale; - } - - if (min) { - /* - * The minimum utilization returns the highest level between: - * - the computed DL bandwidth needed with the IRQ pressure which - * steals time to the deadline task. - * - The minimum performance requirement for CFS and/or RT. - */ - *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); - - /* - * When an RT task is runnable and uclamp is not used, we must - * ensure that the task will run at maximum compute capacity. - */ - if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) - *min = max(*min, scale); - } - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - */ - util = util_cfs + cpu_util_rt(rq); - util += cpu_util_dl(rq); - - /* - * The maximum hint is a soft bandwidth requirement, which can be lower - * than the actual utilization because of uclamp_max requirements. - */ - if (max) - *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - - if (util >= scale) - return scale; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, scale); - util += irq; - - return min(scale, util); -} - -unsigned long sched_cpu_util(int cpu) -{ - return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); -} -#endif /* CONFIG_SMP */ - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -static struct task_struct *find_get_task(pid_t pid) -{ - struct task_struct *p; - guard(rcu)(); - - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - - return p; -} - -DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), - find_get_task(pid), pid_t pid) - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -static void __setscheduler_params(struct task_struct *p, - const struct sched_attr *attr) -{ - int policy = attr->sched_policy; - - if (policy == SETPARAM_POLICY) - policy = p->policy; - - p->policy = policy; - - if (dl_policy(policy)) - __setparam_dl(p, attr); - else if (fair_policy(policy)) - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when - * !rt_policy. Always setting this ensures that things like - * getparam()/getattr() don't report silly values for !rt tasks. - */ - p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - set_load_weight(p, true); -} - -/* - * Check the target process has a UID that matches the current process's: - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - guard(rcu)(); - - pcred = __task_cred(p); - return (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); -} - -/* - * Allow unprivileged RT tasks to decrease priority. - * Only issue a capable test if needed and only once to avoid an audit - * event on permitted non-privileged operations: - */ -static int user_check_sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - int policy, int reset_on_fork) -{ - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !is_nice_reduction(p, attr->sched_nice)) - goto req_priv; - } - - if (rt_policy(policy)) { - unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - - /* Can't set/change the rt policy: */ - if (policy != p->policy && !rlim_rtprio) - goto req_priv; - - /* Can't increase priority: */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - goto req_priv; - } - - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - goto req_priv; - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (task_has_idle_policy(p) && !idle_policy(policy)) { - if (!is_nice_reduction(p, task_nice(p))) - goto req_priv; - } - - /* Can't change other user's priorities: */ - if (!check_same_owner(p)) - goto req_priv; - - /* Normal users shall not reset the sched_reset_on_fork flag: */ - if (p->sched_reset_on_fork && !reset_on_fork) - goto req_priv; - - return 0; - -req_priv: - if (!capable(CAP_SYS_NICE)) - return -EPERM; - - return 0; -} - -static int __sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - bool user, bool pi) -{ - int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; - struct balance_callback *head; - struct rq_flags rf; - int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct rq *rq; - bool cpuset_locked = false; - - /* The pi code expects interrupts enabled */ - BUG_ON(pi && in_interrupt()); -recheck: - /* Double check policy once rq lock held: */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - - if (!valid_policy(policy)) - return -EINVAL; - } - - if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) - return -EINVAL; - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. - */ - if (attr->sched_priority > MAX_RT_PRIO-1) - return -EINVAL; - if ((dl_policy(policy) && !__checkparam_dl(attr)) || - (rt_policy(policy) != (attr->sched_priority != 0))) - return -EINVAL; - - if (user) { - retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); - if (retval) - return retval; - - if (attr->sched_flags & SCHED_FLAG_SUGOV) - return -EINVAL; - - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* Update task specific "requested" clamps */ - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { - retval = uclamp_validate(p, attr); - if (retval) - return retval; - } - - /* - * SCHED_DEADLINE bandwidth accounting relies on stable cpusets - * information. - */ - if (dl_policy(policy) || dl_policy(p->policy)) { - cpuset_locked = true; - cpuset_lock(); - } - - /* - * Make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the appropriate - * runqueue lock must be held. - */ - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - - /* - * Changing the policy of the stop threads its a very bad idea: - */ - if (p == rq->stop) { - retval = -EINVAL; - goto unlock; - } - - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. - */ - if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; - if (dl_policy(policy) && dl_param_changed(p, attr)) - goto change; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) - goto change; - - p->sched_reset_on_fork = reset_on_fork; - retval = 0; - goto unlock; - } -change: - - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0 && - !task_group_is_autogroup(task_group(p))) { - retval = -EPERM; - goto unlock; - } -#endif -#ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy) && - !(attr->sched_flags & SCHED_FLAG_SUGOV)) { - cpumask_t *span = rq->rd->span; - - /* - * Don't allow tasks with an affinity mask smaller than - * the entire root_domain to become SCHED_DEADLINE. We - * will also fail if there's no bandwidth available. - */ - if (!cpumask_subset(span, p->cpus_ptr) || - rq->rd->dl_bw.bw == 0) { - retval = -EPERM; - goto unlock; - } - } -#endif - } - - /* Re-check policy now with rq lock held: */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_rq_unlock(rq, p, &rf); - if (cpuset_locked) - cpuset_unlock(); - goto recheck; - } - - /* - * If setscheduling to SCHED_DEADLINE (or changing the parameters - * of a SCHED_DEADLINE task) we need to check if enough bandwidth - * is available. - */ - if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - retval = -EBUSY; - goto unlock; - } - - p->sched_reset_on_fork = reset_on_fork; - oldprio = p->prio; - - newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); - if (pi) { - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - newprio = rt_effective_prio(p, newprio); - if (newprio == oldprio) - queue_flags &= ~DEQUEUE_MOVE; - } - - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - prev_class = p->sched_class; - - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); - - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; - - enqueue_task(rq, p, queue_flags); - } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); - - /* Avoid rq from going away on us: */ - preempt_disable(); - head = splice_balance_callbacks(rq); - task_rq_unlock(rq, p, &rf); - - if (pi) { - if (cpuset_locked) - cpuset_unlock(); - rt_mutex_adjust_pi(p); - } - - /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callbacks(rq, head); - preempt_enable(); - - return 0; - -unlock: - task_rq_unlock(rq, p, &rf); - if (cpuset_locked) - cpuset_unlock(); - return retval; -} - -static int _sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool check) -{ - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - - /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ - if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - policy &= ~SCHED_RESET_ON_FORK; - attr.sched_policy = policy; - } - - return __sched_setscheduler(p, &attr, check, true); -} -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Use sched_set_fifo(), read its comment. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, true); -} - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, true, true); -} - -int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, false, true); -} -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, false); -} - -/* - * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally - * incapable of resource management, which is the one thing an OS really should - * be doing. - * - * This is of course the reason it is limited to privileged users only. - * - * Worse still; it is fundamentally impossible to compose static priority - * workloads. You cannot take two correctly working static prio workloads - * and smash them together and still expect them to work. - * - * For this reason 'all' FIFO tasks the kernel creates are basically at: - * - * MAX_RT_PRIO / 2 - * - * The administrator _MUST_ configure the system, the kernel simply doesn't - * know enough information to make a sensible choice. - */ -void sched_set_fifo(struct task_struct *p) -{ - struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; - WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_fifo); - -/* - * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. - */ -void sched_set_fifo_low(struct task_struct *p) -{ - struct sched_param sp = { .sched_priority = 1 }; - WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_fifo_low); - -void sched_set_normal(struct task_struct *p, int nice) -{ - struct sched_attr attr = { - .sched_policy = SCHED_NORMAL, - .sched_nice = nice, - }; - WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_normal); - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - return sched_setscheduler(p, policy, &lparam); -} - -/* - * Mimics kernel/events/core.c perf_copy_attr(). - */ -static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -{ - u32 size; - int ret; - - /* Zero the full structure, so that a short copy will be nice: */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - /* ABI compatibility quirk: */ - if (!size) - size = SCHED_ATTR_SIZE_VER0; - if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) - goto err_size; - - ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); - if (ret) { - if (ret == -E2BIG) - goto err_size; - return ret; - } - - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && - size < SCHED_ATTR_SIZE_VER1) - return -EINVAL; - - /* - * XXX: Do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); - - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -static void get_params(struct task_struct *p, struct sched_attr *attr) -{ - if (task_has_dl_policy(p)) - __getparam_dl(p, attr); - else if (task_has_rt_policy(p)) - attr->sched_priority = p->rt_priority; - else - attr->sched_nice = task_nice(p); -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -{ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -} - -/** - * sys_sched_setattr - same as above, but with extended sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @flags: for future extension. - */ -SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, flags) -{ - struct sched_attr attr; - int retval; - - if (!uattr || pid < 0 || flags) - return -EINVAL; - - retval = sched_copy_attr(uattr, &attr); - if (retval) - return retval; - - if ((int)attr.sched_policy < 0) - return -EINVAL; - if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) - attr.sched_policy = SETPARAM_POLICY; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - - return sched_setattr(p, &attr); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - * - * Return: On success, the policy of the thread. Otherwise, a negative error - * code. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; - - guard(rcu)(); - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (!retval) { - retval = p->policy; - if (p->sched_reset_on_fork) - retval |= SCHED_RESET_ON_FORK; - } - return retval; -} - -/** - * sys_sched_getparam - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - * - * Return: On success, 0 and the RT priority is in @param. Otherwise, an error - * code. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp = { .sched_priority = 0 }; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - scoped_guard (rcu) { - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - } - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -} - -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - -/** - * sys_sched_getattr - similar to sched_getparam, but with sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) for fwd/bwd comp. - * @flags: for future extension. - */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, usize, unsigned int, flags) -{ - struct sched_attr kattr = { }; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - scoped_guard (rcu) { - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; - -#ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -#endif - } - - return sched_attr_copy_to_user(uattr, &kattr, usize); -} - -#ifdef CONFIG_SMP -int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - /* - * If the task isn't a deadline task or admission control is - * disabled then we don't care about affinity changes. - */ - if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) - return 0; - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ - guard(rcu)(); - if (!cpumask_subset(task_rq(p)->rd->span, mask)) - return -EBUSY; - - return 0; -} -#endif - -static int -__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) -{ - int retval; - cpumask_var_t cpus_allowed, new_mask; - - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) - return -ENOMEM; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, ctx->new_mask, cpus_allowed); - - ctx->new_mask = new_mask; - ctx->flags |= SCA_CHECK; - - retval = dl_task_check_affinity(p, new_mask); - if (retval) - goto out_free_new_mask; - - retval = __set_cpus_allowed_ptr(p, ctx); - if (retval) - goto out_free_new_mask; - - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset update. - * Just reset the cpumask to the cpuset's cpus_allowed. - */ - cpumask_copy(new_mask, cpus_allowed); - - /* - * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() - * will restore the previous user_cpus_ptr value. - * - * In the unlikely event a previous user_cpus_ptr exists, - * we need to further restrict the mask to what is allowed - * by that old user_cpus_ptr. - */ - if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { - bool empty = !cpumask_and(new_mask, new_mask, - ctx->user_mask); - - if (WARN_ON_ONCE(empty)) - cpumask_copy(new_mask, cpus_allowed); - } - __set_cpus_allowed_ptr(p, ctx); - retval = -EINVAL; - } - -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - struct affinity_context ac; - struct cpumask *user_mask; - int retval; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - if (p->flags & PF_NO_SETAFFINITY) - return -EINVAL; - - if (!check_same_owner(p)) { - guard(rcu)(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) - return -EPERM; - } - - retval = security_task_setscheduler(p); - if (retval) - return retval; - - /* - * With non-SMP configs, user_cpus_ptr/user_mask isn't used and - * alloc_user_cpus_ptr() returns NULL. - */ - user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); - if (user_mask) { - cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { - return -ENOMEM; - } - - ac = (struct affinity_context){ - .new_mask = in_mask, - .user_mask = user_mask, - .flags = SCA_USER, - }; - - retval = __sched_setaffinity(p, &ac); - kfree(ac.user_mask); - - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) -{ - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); - - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new CPU mask - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, struct cpumask *mask) -{ - struct task_struct *p; - int retval; - - guard(rcu)(); - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - guard(raw_spinlock_irqsave)(&p->pi_lock); - cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - - return 0; -} - -/** - * sys_sched_getaffinity - get the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current CPU mask - * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - unsigned int retlen = min(len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -static void do_sched_yield(void) -{ - struct rq_flags rf; - struct rq *rq; - - rq = this_rq_lock_irq(&rf); - - schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); - - preempt_disable(); - rq_unlock_irq(rq, &rf); - sched_preempt_enable_no_resched(); - - schedule(); -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - do_sched_yield(); - return 0; -} - #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { @@ -8904,105 +7356,11 @@ PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); -#else /* !CONFIG_PREEMPT_DYNAMIC */ +#else /* !CONFIG_PREEMPT_DYNAMIC: */ static inline void preempt_dynamic_init(void) { } -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ - -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, it's already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - do_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) -{ - struct task_struct *curr = current; - struct rq *rq, *p_rq; - int yielded = 0; - - scoped_guard (irqsave) { - rq = this_rq(); - -again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) - return -ESRCH; - - guard(double_rq_lock)(rq, p_rq); - if (task_rq(p) != p_rq) - goto again; - - if (!curr->sched_class->yield_to_task) - return 0; - - if (curr->sched_class != p->sched_class) - return 0; - - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - return 0; - - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity - * takes care of fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); - } - } - - if (yielded) - schedule(); - - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); +#endif /* CONFIG_PREEMPT_DYNAMIC */ int io_schedule_prepare(void) { @@ -9045,123 +7403,6 @@ void __sched io_schedule(void) } EXPORT_SYMBOL(io_schedule); -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_RT_PRIO-1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -{ - unsigned int time_slice = 0; - int retval; - - if (pid < 0) - return -EINVAL; - - scoped_guard (rcu) { - struct task_struct *p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - scoped_guard (task_rq_lock, p) { - struct rq *rq = scope.rq; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - } - } - - jiffies_to_timespec64(time_slice, t); - return 0; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct __kernel_timespec __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_timespec64(&t, interval); - - return retval; -} - -#ifdef CONFIG_COMPAT_32BIT_TIME -SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, - struct old_timespec32 __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_old_timespec32(&t, interval); - return retval; -} -#endif - void sched_show_task(struct task_struct *p) { unsigned long free = 0; @@ -9729,7 +7970,7 @@ int sched_cpu_deactivate(unsigned int cpu) * Specifically, we rely on ttwu to no longer target this CPU, see * ttwu_queue_cond() and is_cpu_allowed(). * - * Do sync before park smpboot threads to take care the rcu boost case. + * Do sync before park smpboot threads to take care the RCU boost case. */ synchronize_rcu(); @@ -9804,7 +8045,7 @@ int sched_cpu_wait_empty(unsigned int cpu) * Since this CPU is going 'away' for a while, fold any nr_active delta we * might have. Called from the CPU stopper task after ensuring that the * stopper is the last running task on the CPU, so nr_active count is - * stable. We need to take the teardown thread which is calling this into + * stable. We need to take the tear-down thread which is calling this into * account, so we hand in adjust = 1 to the load calculation. * * Also see the comment "Global load-average calculations". @@ -9998,7 +8239,7 @@ void __init sched_init(void) /* * How much CPU bandwidth does root_task_group get? * - * In case of task-groups formed thr' the cgroup filesystem, it + * In case of task-groups formed through the cgroup filesystem, it * gets 100% of the CPU resources in the system. This overall * system CPU resource is divided among the tasks of * root_task_group and its child task-groups in a fair manner, @@ -10300,7 +8541,7 @@ void normalize_rt_tasks(void) #if defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for kdb. + * These functions are only useful for KDB. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -10408,7 +8649,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) online_fair_sched_group(tg); } -/* rcu callback to free various structures associated with a task group */ +/* RCU callback to free various structures associated with a task group */ static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ @@ -11526,10 +9767,10 @@ const int sched_prio_to_weight[40] = { }; /* - * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated. * * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions + * pre-calculated inverse to speed up arithmetics by turning divisions * into multiplications: */ const u32 sched_prio_to_wmult[40] = { @@ -11785,16 +10026,16 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) /* * Move the src cid if the dst cid is unset. This keeps id * allocation closest to 0 in cases where few threads migrate around - * many cpus. + * many CPUs. * * If destination cid is already set, we may have to just clear * the src cid to ensure compactness in frequent migrations * scenarios. * * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed cpus, because user-space + * greater or equal to the number of allowed CPUs, because user-space * can expect that the number of allowed cids can reach the number of - * allowed cpus. + * allowed CPUs. */ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); dst_cid = READ_ONCE(dst_pcpu_cid->cid); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index a57fd8f27498..1ef98a93eb1d 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq) continue; /* - * Note: this will account forceidle to the current cpu, even + * Note: this will account forceidle to the current CPU, even * if it comes from our SMT sibling. */ __account_forceidle_time(p, delta); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aa48b2ec879d..a5e00293ae43 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -14,11 +14,11 @@ * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can + * This may result in other CPU reading this CPU's IRQ time and can * race with irq/vtime_account on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of IRQ time to wrong + * task when IRQ is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each IRQ in account_system_time. */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); @@ -269,7 +269,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) } /* - * Account how much elapsed time was spent in steal, irq, or softirq time. + * Account how much elapsed time was spent in steal, IRQ, or softirq time. */ static inline u64 account_other_time(u64 max) { @@ -370,7 +370,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * Check for hardirq is done both for system and user time as there is * no timer going off while we are on hardirq and hence we may never get an * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq + * p->stime and friends are only updated on system time and not on IRQ * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, @@ -380,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, /* * When returning from idle, many ticks can get accounted at - * once, including some ticks of steal, irq, and softirq time. + * once, including some ticks of steal, IRQ, and softirq time. * Subtract those ticks from the amount of time accounted to * idle, or potentially user or system time. Due to rounding, * other time can exceed ticks occasionally. diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c75d1307d86d..f59e5c19d944 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -708,7 +708,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p } /* - * And we finally need to fixup root_domain(s) bandwidth accounting, + * And we finally need to fix up root_domain(s) bandwidth accounting, * since p is still hanging out in the old (now moved to default) root * domain. */ @@ -992,7 +992,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * is detected, the runtime and deadline need to be updated. * * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is + * CBS is applied. The runtime is replenished and a new absolute deadline is * set, as in the previous cases. * * However, the Original CBS does not work properly for tasks with @@ -1294,7 +1294,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw - * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. * Since delta is a 64 bit variable, to have an overflow its value should be * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is * not an issue here. @@ -1804,8 +1804,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). + * + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). */ - hrtimer_try_to_cancel(&p->dl.dl_timer); + if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && + !dl_server(&p->dl)) + put_task_struct(p); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -2488,7 +2493,7 @@ static void pull_dl_task(struct rq *this_rq) src_rq = cpu_rq(cpu); /* - * It looks racy, abd it is! However, as in sched_rt.c, + * It looks racy, and it is! However, as in sched_rt.c, * we are fine with this. */ if (this_rq->dl.dl_nr_running && diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a5b1ae0aa55..9057584ec06d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,7 +61,7 @@ * Options are: * * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -3835,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +void reweight_task(struct task_struct *p, const struct load_weight *lw) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct load_weight *load = &se->load; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); - load->inv_weight = sched_prio_to_wmult[prio]; + reweight_entity(cfs_rq, se, lw->weight); + load->inv_weight = lw->inv_weight; } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -8719,7 +8718,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of CPUs in + * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: @@ -9149,12 +9148,8 @@ static int detach_tasks(struct lb_env *env) break; env->loop++; - /* - * We've more or less seen every task there is, call it quits - * unless we haven't found any movable task yet. - */ - if (env->loop > env->loop_max && - !(env->flags & LBF_ALL_PINNED)) + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) break; /* take a breather every nr_migrate tasks */ @@ -11393,9 +11388,7 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - /* Stop if we tried all running tasks */ - if (env.loop < busiest->nr_running) - goto more_balance; + goto more_balance; } /* @@ -11892,6 +11885,13 @@ static void kick_ilb(unsigned int flags) return; /* + * Don't bother if no new NOHZ balance work items for ilb_cpu, + * i.e. all bits in flags are already set in ilb_cpu. + */ + if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) + return; + + /* * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets * the first flag owns it; cleared by nohz_csd_func(). */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6135fbe83d68..6e78d071beb5 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -172,19 +172,13 @@ static void cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq. + * case, exit the function after re-enabling the local IRQ. */ if (need_resched()) { local_irq_enable(); return; } - /* - * The RCU framework needs to be told that we are entering an idle - * section, so no more rcu read side critical sections and one more - * step to the grace period - */ - if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); @@ -244,7 +238,7 @@ exit_idle: __current_set_polling(); /* - * It is up to the idle functions to reenable local interrupts + * It is up to the idle functions to re-enable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); @@ -320,7 +314,7 @@ static void do_idle(void) rcu_nocb_flush_deferred_wakeup(); /* - * In poll mode we reenable interrupts and spin. Also if we + * In poll mode we re-enable interrupts and spin. Also if we * detected in the wakeup from idle path that the tick * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ca9da66cc894..c48900b856a2 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -45,7 +45,7 @@ * again, being late doesn't loose the delta, just wrecks the sample. * * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because - * this would add another cross-CPU cacheline miss and atomic operation + * this would add another cross-CPU cache-line miss and atomic operation * to the wakeup path. Instead we increment on whatever CPU the task ran * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over @@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */ /** * get_avenrun - get the load average array - * @loads: pointer to dest load array + * @loads: pointer to destination load array * @offset: offset to add * @shift: shift count to shift the result left * diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index ef00382de595..fa52906a4478 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -417,7 +417,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* - * irq: + * IRQ: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum @@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) int ret = 0; /* - * We can't use clock_pelt because irq time is not accounted in + * We can't use clock_pelt because IRQ time is not accounted in * clock_task. Instead we directly scale the running time to * reflect the real amount of computation */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7b4aa5809c0f..020d58967d4e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -41,7 +41,7 @@ * What it means for a task to be productive is defined differently * for each resource. For IO, productive means a running task. For * memory, productive means a running task that isn't a reclaimer. For - * CPU, productive means an oncpu task. + * CPU, productive means an on-CPU task. * * Naturally, the FULL state doesn't exist for the CPU resource at the * system level, but exist at the cgroup level. At the cgroup level, @@ -49,7 +49,7 @@ * resource which is being used by others outside of the cgroup or * throttled by the cgroup cpu.max configuration. * - * The percentage of wallclock time spent in those compound stall + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -218,28 +218,32 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return unlikely(tasks[NR_IOWAIT]); - case PSI_IO_FULL: - return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); - case PSI_MEM_SOME: - return unlikely(tasks[NR_MEMSTALL]); - case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && - tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); - case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > oncpu); - case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !oncpu); - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); + } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } static void get_recent_times(struct psi_group *group, int cpu, @@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group, /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -770,9 +774,9 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; - enum psi_states s; u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* @@ -841,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu, return; } - for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) - state_mask |= (1 << s); - } + state_mask = test_states(groupc->tasks, state_mask); /* * Since we care about lost potential, a memstall is FULL @@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } #ifdef CONFIG_IRQ_TIME_ACCOUNTING -void psi_account_irqtime(struct task_struct *task, u32 delta) +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { - int cpu = task_cpu(task); + int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now; + u64 now, irq; + s64 delta; if (static_branch_likely(&psi_disabled)) return; - if (!task->pid) + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) return; now = cpu_clock(cpu); + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; - group = task_psi_group(task); do { if (!group->enabled) continue; @@ -1194,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group) /* * After we disable psi_group->enabled, we don't actually * stop percpu tasks accounting in each psi_group_cpu, - * instead only stop test_state() loop, record_times() + * instead only stop test_states() loop, record_times() * and averaging worker, see psi_group_change() for details. * * When disable cgroup PSI, this function has nothing to sync @@ -1202,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group) * would see !psi_group->enabled and only do task accounting. * * When re-enable cgroup PSI, this function use psi_group_change() - * to get correct state mask from test_state() loop on tasks[], + * to get correct state mask from test_states() loop on tasks[], * and restart groupc->state_start from now, use .clear = .set = 0 * here since no task status really changed. */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aa4c1c874fa4..63e49c8ffc4d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -140,7 +140,7 @@ void init_rt_rq(struct rt_rq *rt_rq) INIT_LIST_HEAD(array->queue + i); __clear_bit(i, array->bitmap); } - /* delimiter for bitsearch: */ + /* delimiter for bit-search: */ __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP @@ -1135,7 +1135,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) /* * This may have been our highest task, and therefore - * we may have some recomputation to do + * we may have some re-computation to do */ if (prio == prev_prio) { struct rt_prio_array *array = &rt_rq->active; @@ -1571,7 +1571,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * * For equal prio tasks, we just let the scheduler sort it out. * - * Otherwise, just let it ride on the affined RQ and the + * Otherwise, just let it ride on the affine RQ and the * post-schedule router will push the preempted task away * * This test is optimistic, if we get it wrong the load-balancer @@ -2147,14 +2147,14 @@ static void push_rt_tasks(struct rq *rq) * if its the only CPU with multiple RT tasks queued, and a large number * of CPUs scheduling a lower priority task at the same time. * - * Each root domain has its own irq work function that can iterate over + * Each root domain has its own IRQ work function that can iterate over * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT * task must be checked if there's one or many CPUs that are lowering - * their priority, there's a single irq work iterator that will try to + * their priority, there's a single IRQ work iterator that will try to * push off RT tasks that are waiting to run. * * When a CPU schedules a lower priority task, it will kick off the - * irq work iterator that will jump to each CPU with overloaded RT tasks. + * IRQ work iterator that will jump to each CPU with overloaded RT tasks. * As it only takes the first CPU that schedules a lower priority task * to start the process, the rto_start variable is incremented and if * the atomic result is one, then that CPU will try to take the rto_lock. @@ -2162,7 +2162,7 @@ static void push_rt_tasks(struct rq *rq) * CPUs scheduling lower priority tasks. * * All CPUs that are scheduling a lower priority task will increment the - * rt_loop_next variable. This will make sure that the irq work iterator + * rt_loop_next variable. This will make sure that the IRQ work iterator * checks all RT overloaded CPUs whenever a CPU schedules a new lower * priority task, even if the iterator is in the middle of a scan. Incrementing * the rt_loop_next will cause the iterator to perform another scan. @@ -2242,7 +2242,7 @@ static void tell_cpu_to_push(struct rq *rq) * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. - * Otherwise it is finishing up and an ipi needs to be sent. + * Otherwise it is finishing up and an IPI needs to be sent. */ if (rq->rd->rto_cpu < 0) cpu = rto_next_cpu(rq->rd); @@ -2594,7 +2594,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) watchdog(rq, p); /* - * RR tasks need a special form of timeslice management. + * RR tasks need a special form of time-slice management. * FIFO tasks have no timeslices. */ if (p->policy != SCHED_RR) @@ -2900,7 +2900,7 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { - /* Don't accept realtime tasks when there is no way for them to run */ + /* Don't accept real-time tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; @@ -3001,7 +3001,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: + * Also, writing zero resets the time-slice to default: */ if (!ret && write) { sched_rr_timeslice = diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a831af102070..4c36cc680361 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -74,6 +74,12 @@ #include "../workqueue_internal.h" +struct rq; +struct cfs_rq; +struct rt_rq; +struct sched_group; +struct cpuidle_state; + #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -90,9 +96,6 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif -struct rq; -struct cpuidle_state; - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -128,12 +131,12 @@ extern struct list_head asym_cap_list; /* * Helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ)) /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group * hierarchies, especially on larger systems. This is not a user-visible change * and does not change the user-interface for setting shares/weights. * @@ -147,12 +150,13 @@ extern struct list_head asym_cap_list; #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) \ -({ \ - unsigned long __w = (w); \ - if (__w) \ - __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ - __w; \ +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ }) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) @@ -187,6 +191,7 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -201,6 +206,7 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } + static inline bool valid_policy(int policy) { return idle_policy(policy) || fair_policy(policy) || @@ -222,11 +228,12 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; + *avg += diff / 8; } @@ -251,7 +258,7 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 -#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { @@ -358,9 +365,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, #ifdef CONFIG_CGROUP_SCHED -struct cfs_rq; -struct rt_rq; - extern struct list_head task_groups; struct cfs_bandwidth { @@ -406,7 +410,7 @@ struct task_group { #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put - * it in its own cacheline separated from the fields above which + * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; @@ -536,6 +540,7 @@ static inline void set_task_rq_fair(struct sched_entity *se, #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; + static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -551,8 +556,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent * applicable for 32-bits architectures. */ #ifdef CONFIG_64BIT -# define u64_u32_load_copy(var, copy) var -# define u64_u32_store_copy(var, copy, val) (var = val) +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) #else # define u64_u32_load_copy(var, copy) \ ({ \ @@ -580,8 +585,8 @@ do { \ copy = __val; \ } while (0) #endif -# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) -# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -803,6 +808,7 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED + /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) @@ -820,16 +826,18 @@ static inline long se_runnable(struct sched_entity *se) return se->runnable_weight; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + #define entity_is_task(se) 1 -static inline void se_update_runnable(struct sched_entity *se) {} +static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { return !!se->on_rq; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP /* @@ -874,7 +882,7 @@ struct root_domain { */ bool overloaded; - /* Indicate one or more cpus over-utilized (tipping point) */ + /* Indicate one or more CPUs over-utilized (tipping point) */ bool overutilized; /* @@ -988,7 +996,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ -struct rq; struct balance_callback { struct balance_callback *next; void (*func)(struct rq *rq); @@ -1126,6 +1133,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; @@ -1143,7 +1151,7 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; - ktime_t hrtick_time; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS @@ -1165,7 +1173,7 @@ struct rq { #endif #ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ + /* Must be inspected within a RCU lock section */ struct cpuidle_state *idle_state; #endif @@ -1227,7 +1235,7 @@ static inline int cpu_of(struct rq *rq) #endif } -#define MDF_PUSH 0x01 +#define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { @@ -1246,7 +1254,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) -struct sched_group; #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1282,9 +1289,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, - bool fi); -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +extern bool +cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1352,7 +1360,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) { @@ -1390,7 +1398,8 @@ static inline bool sched_group_cookie_match(struct rq *rq, { return true; } -#endif /* CONFIG_SCHED_CORE */ + +#endif /* !CONFIG_SCHED_CORE */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1421,8 +1430,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq) static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) { unsigned long flags; + local_irq_save(flags); raw_spin_rq_lock(rq); + return flags; } @@ -1451,6 +1462,7 @@ static inline void update_idle_core(struct rq *rq) { } #endif #ifdef CONFIG_FAIR_GROUP_SCHED + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -1474,9 +1486,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ -#define task_of(_se) container_of(_se, struct task_struct, se) +#define task_of(_se) container_of(_se, struct task_struct, se) static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { @@ -1496,7 +1508,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { return NULL; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void update_rq_clock(struct rq *rq); @@ -1622,9 +1635,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif +# endif #endif } @@ -1650,9 +1663,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +extern struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); +extern struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1679,48 +1694,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irqrestore(rq, rf->flags); } -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irq(rq); } -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1742,8 +1751,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, rq_unlock_irqrestore(_T->lock, &_T->rf), struct rq_flags rf) -static inline struct rq * -this_rq_lock_irq(struct rq_flags *rf) +static inline struct rq *this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) { struct rq *rq; @@ -1751,15 +1759,18 @@ this_rq_lock_irq(struct rq_flags *rf) local_irq_disable(); rq = this_rq(); rq_lock(rq, rf); + return rq; } #ifdef CONFIG_NUMA + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE, }; + extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); @@ -1768,18 +1779,23 @@ extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -#else + +#else /* !CONFIG_NUMA: */ + static inline void sched_init_numa(int offline_node) { } static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } + static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { return nr_cpu_ids; } -#endif + +#endif /* !CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING + /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { NUMA_MEM = 0, @@ -1787,17 +1803,21 @@ enum numa_faults_stats { NUMA_MEMBUF, NUMA_CPUBUF }; + extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); -#else + +#else /* !CONFIG_NUMA_BALANCING: */ + static inline void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { } -#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* !CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -1822,8 +1842,7 @@ queue_balance_callback(struct rq *rq, } #define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -1894,6 +1913,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + extern struct static_key_false sched_asym_cpucapacity; extern struct static_key_false sched_cluster_active; @@ -1957,15 +1977,11 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); #ifdef CONFIG_SCHED_DEBUG -void update_sched_domain_debugfs(void); -void dirty_sched_domain_sysctl(int cpu); +extern void update_sched_domain_debugfs(void); +extern void dirty_sched_domain_sysctl(int cpu); #else -static inline void update_sched_domain_debugfs(void) -{ -} -static inline void dirty_sched_domain_sysctl(int cpu) -{ -} +static inline void update_sched_domain_debugfs(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) { } #endif extern int sched_update_scaling(void); @@ -1976,6 +1992,7 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return cpu_possible_mask; /* &init_task.cpus_mask */ return p->user_cpus_ptr; } + #endif /* CONFIG_SMP */ #include "stats.h" @@ -1998,13 +2015,13 @@ static inline void sched_core_tick(struct rq *rq) __sched_core_tick(rq); } -#else +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ -static inline void sched_core_account_forceidle(struct rq *rq) {} +static inline void sched_core_account_forceidle(struct rq *rq) { } -static inline void sched_core_tick(struct rq *rq) {} +static inline void sched_core_tick(struct rq *rq) { } -#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ #ifdef CONFIG_CGROUP_SCHED @@ -2046,15 +2063,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2099,6 +2117,7 @@ enum { extern const_debug unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -2111,13 +2130,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG */ +#else /* !SCHED_DEBUG: */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -2133,7 +2152,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG */ +#endif /* !SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2176,13 +2195,13 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2252,9 +2271,9 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; - unsigned int flags; + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; }; extern s64 update_curr_common(struct rq *rq); @@ -2402,8 +2421,19 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); +extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); +} + static inline struct task_struct *get_push_task(struct rq *rq) { struct task_struct *p = rq->curr; @@ -2425,9 +2455,23 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#endif +#else /* !CONFIG_SMP: */ + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) +{ + return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + +#endif /* !CONFIG_SMP */ #ifdef CONFIG_CPU_IDLE + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2440,7 +2484,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) return rq->idle_state; } -#else + +#else /* !CONFIG_CPU_IDLE: */ + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2450,7 +2496,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } -#endif + +#endif /* !CONFIG_CPU_IDLE */ extern void schedule_idle(void); asmlinkage void schedule_user(void); @@ -2463,7 +2510,7 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); +extern void reweight_task(struct task_struct *p, const struct load_weight *lw); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -2479,7 +2526,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se); #define RATIO_SHIFT 8 #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -unsigned long to_ratio(u64 period, u64 runtime); + +extern unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -2505,10 +2553,10 @@ static inline void sched_update_tick_dependency(struct rq *rq) else tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#else +#else /* !CONFIG_NO_HZ_FULL: */ static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -2544,9 +2592,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT -#define SCHED_NR_MIGRATE_BREAK 8 +# define SCHED_NR_MIGRATE_BREAK 8 #else -#define SCHED_NR_MIGRATE_BREAK 32 +# define SCHED_NR_MIGRATE_BREAK 32 #endif extern const_debug unsigned int sysctl_sched_nr_migrate; @@ -2595,9 +2643,9 @@ static inline int hrtick_enabled_dl(struct rq *rq) return hrtick_enabled(rq); } -void hrtick_start(struct rq *rq, u64 delay); +extern void hrtick_start(struct rq *rq, u64 delay); -#else +#else /* !CONFIG_SCHED_HRTICK: */ static inline int hrtick_enabled_fair(struct rq *rq) { @@ -2614,13 +2662,10 @@ static inline int hrtick_enabled(struct rq *rq) return 0; } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -static __always_inline -void arch_scale_freq_tick(void) -{ -} +static __always_inline void arch_scale_freq_tick(void) { } #endif #ifndef arch_scale_freq_capacity @@ -2657,13 +2702,13 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) #endif } #else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } #endif -#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ -__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ -static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ -{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } #ifdef CONFIG_SMP @@ -2717,7 +2762,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#else +#else /* !CONFIG_PREEMPTION: */ /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are @@ -2748,7 +2793,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#endif /* CONFIG_PREEMPTION */ +#endif /* !CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. @@ -2824,9 +2869,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern void set_rq_online (struct rq *rq); extern void set_rq_offline(struct rq *rq); + extern bool sched_smp_initialized; -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ /* * double_rq_lock - safely lock two runqueues @@ -2860,7 +2906,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -#endif +#endif /* !CONFIG_SMP */ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), @@ -2881,16 +2927,15 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -#ifdef CONFIG_NUMA_BALANCING -extern void -show_numa_stats(struct task_struct *p, struct seq_file *m); +# ifdef CONFIG_NUMA_BALANCING +extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, - unsigned long tpf, unsigned long gsf, unsigned long gpf); -#endif /* CONFIG_NUMA_BALANCING */ -#else -static inline void resched_latency_warn(int cpu, u64 latency) {} -#endif /* CONFIG_SCHED_DEBUG */ + unsigned long tpf, unsigned long gsf, unsigned long gpf); +# endif /* CONFIG_NUMA_BALANCING */ +#else /* !CONFIG_SCHED_DEBUG: */ +static inline void resched_latency_warn(int cpu, u64 latency) { } +#endif /* !CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -2900,6 +2945,7 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON + #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 #define NOHZ_NEWILB_KICK_BIT 2 @@ -2914,14 +2960,14 @@ extern void cfs_bandwidth_usage_dec(void); /* Update nohz.next_balance */ #define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) extern void nohz_balance_exit_idle(struct rq *rq); -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balance_exit_idle(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_run_idle_balance(int cpu); @@ -2930,6 +2976,7 @@ static inline void nohz_run_idle_balance(int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { u64 total; u64 tick_delta; @@ -2957,9 +3004,11 @@ static inline u64 irq_time_read(int cpu) return total; } + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ + DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** @@ -2993,9 +3042,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) if (data) data->func(data, rq_clock(rq), flags); } -#else -static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -#endif /* CONFIG_CPU_FREQ */ +#else /* !CONFIG_CPU_FREQ: */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +#endif /* !CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant @@ -3006,6 +3055,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3048,9 +3098,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#endif + +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK + unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static inline unsigned long uclamp_rq_get(struct rq *rq, @@ -3097,9 +3149,40 @@ static inline bool uclamp_is_used(void) { return static_branch_likely(&sched_uclamp_used); } -#else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_eff_value(struct task_struct *p, - enum uclamp_id clamp_id) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; + + +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); +} + +static inline void +uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; +} + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3114,8 +3197,8 @@ static inline bool uclamp_is_used(void) return false; } -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) +static inline unsigned long +uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3123,8 +3206,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq, return SCHED_CAPACITY_SCALE; } -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) +static inline void +uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) { } @@ -3132,9 +3215,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } -#endif /* CONFIG_UCLAMP_TASK */ + +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + static inline unsigned long cpu_util_irq(struct rq *rq) { return READ_ONCE(rq->avg_irq.util_avg); @@ -3149,7 +3234,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } -#else + +#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */ + static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; @@ -3160,7 +3247,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned { return util; } -#endif + +#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -3178,11 +3266,13 @@ extern struct cpufreq_governor schedutil_gov; #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL + static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_MEMBARRIER + /* * The scheduler provides memory barriers required by membarrier between: * - prior user-space memory accesses and store to rq->membarrier_state, @@ -3204,13 +3294,16 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else + +#else /* !CONFIG_MEMBARRIER :*/ + static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, struct mm_struct *next_mm) { } -#endif + +#endif /* !CONFIG_MEMBARRIER */ #ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) @@ -3262,7 +3355,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid) * be held to transition to other states. * * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across cpus, which prevents use of this_cpu_cmpxchg. + * consistent across CPUs, which prevents use of this_cpu_cmpxchg. */ static inline void mm_cid_put_lazy(struct task_struct *t) { @@ -3329,6 +3422,7 @@ static inline int __mm_cid_try_get(struct mm_struct *mm) } if (cpumask_test_and_set_cpu(cid, cpumask)) return -1; + return cid; } @@ -3393,6 +3487,7 @@ unlock: raw_spin_unlock(&cid_lock); end: mm_cid_snapshot_time(rq, mm); + return cid; } @@ -3415,6 +3510,7 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) } cid = __mm_cid_get(rq, mm); __this_cpu_write(pcpu_cid->cid, cid); + return cid; } @@ -3469,15 +3565,68 @@ static inline void switch_mm_cid(struct rq *rq, next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); } -#else +#else /* !CONFIG_SCHED_MM_CID: */ static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } -#endif +#endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_RT_MUTEXES + +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + +#else /* !CONFIG_RT_MUTEXES: */ + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} + +#endif /* !CONFIG_RT_MUTEXES */ + +extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); +extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern void __setscheduler_prio(struct task_struct *p, int prio); +extern void set_load_weight(struct task_struct *p, bool update_load); +extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +#ifdef CONFIG_SMP +extern struct balance_callback *splice_balance_callbacks(struct rq *rq); +extern void balance_callbacks(struct rq *rq, struct balance_callback *head); +#else + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..237780aa3c53 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_account_irqtime(struct task_struct *task, u32 delta); - +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO @@ -219,7 +224,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) /* * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. + * can keep stats on how long its time-slice is. */ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c new file mode 100644 index 000000000000..ae1b42775ef9 --- /dev/null +++ b/kernel/sched/syscalls.c @@ -0,0 +1,1699 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/sched/syscalls.c + * + * Core kernel scheduler syscalls related code + * + * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat + */ +#include <linux/sched.h> +#include <linux/cpuset.h> +#include <linux/sched/debug.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "autogroup.h" + +static inline int __normal_prio(int policy, int rt_prio, int nice) +{ + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +void set_user_nice(struct task_struct *p, long nice) +{ + bool queued, running; + struct rq *rq; + int old_prio; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + + update_rq_clock(rq); + + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling until the task is + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: + */ + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + return; + } + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + if (running) + put_prev_task(rq, p); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * is_nice_reduction - check if nice value is an actual reduction + * + * Similar to can_nice() but does not perform a capability check. + * + * @p: task + * @nice: nice value + */ +static bool is_nice_reduction(const struct task_struct *p, const int nice) +{ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); +} + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * Return: The priority value as seen by users in /proc. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 + */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * idle_cpu - is a given CPU idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (rq->ttwu_pending) + return 0; +#endif + + return 1; +} + +/** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** + * idle_task - return the idle task for a given CPU. + * @cpu: the processor in question. + * + * Return: The idle task for the CPU @cpu. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + +#ifdef CONFIG_SMP +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the IRQ utilization. + * + * The DL bandwidth number OTOH is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max) +{ + unsigned long util, irq, scale; + struct rq *rq = cpu_rq(cpu); + + scale = arch_scale_cpu_capacity(cpu); + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = util_cfs + cpu_util_rt(rq); + util += cpu_util_dl(rq); + + /* + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. + */ + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + + if (util >= scale) + return scale; + + /* + * There is still idle time; further improve the number by using the + * IRQ metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, scale); + util += irq; + + return min(scale, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +} +#endif /* CONFIG_SMP */ + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ + int policy = attr->sched_policy; + + if (policy == SETPARAM_POLICY) + policy = p->policy; + + p->policy = policy; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); +} + +/* + * Check the target process has a UID that matches the current process's: + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + guard(rcu)(); + + pcred = __task_cred(p); + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); +} + +#ifdef CONFIG_UCLAMP_TASK + +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; + + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) + return -EINVAL; + + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + + return 0; +} + +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + enum uclamp_id clamp_id; + + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; + + if (!uclamp_reset(attr, clamp_id, uc_se)) + continue; + + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + value = sysctl_sched_uclamp_util_min_rt_default; + else + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); + + } + + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } +#endif + +/* + * Allow unprivileged RT tasks to decrease priority. + * Only issue a capable test if needed and only once to avoid an audit + * event on permitted non-privileged operations: + */ +static int user_check_sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + int policy, int reset_on_fork) +{ + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !is_nice_reduction(p, attr->sched_nice)) + goto req_priv; + } + + if (rt_policy(policy)) { + unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + + /* Can't set/change the rt policy: */ + if (policy != p->policy && !rlim_rtprio) + goto req_priv; + + /* Can't increase priority: */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + goto req_priv; + } + + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + goto req_priv; + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (task_has_idle_policy(p) && !idle_policy(policy)) { + if (!is_nice_reduction(p, task_nice(p))) + goto req_priv; + } + + /* Can't change other user's priorities: */ + if (!check_same_owner(p)) + goto req_priv; + + /* Normal users shall not reset the sched_reset_on_fork flag: */ + if (p->sched_reset_on_fork && !reset_on_fork) + goto req_priv; + + return 0; + +req_priv: + if (!capable(CAP_SYS_NICE)) + return -EPERM; + + return 0; +} + +int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user, bool pi) +{ + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; + const struct sched_class *prev_class; + struct balance_callback *head; + struct rq_flags rf; + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; + bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +recheck: + /* Double check policy once rq lock held: */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); + + if (!valid_policy(policy)) + return -EINVAL; + } + + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) + return -EINVAL; + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if (attr->sched_priority > MAX_RT_PRIO-1) + return -EINVAL; + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) + return -EINVAL; + + if (user) { + retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); + if (retval) + return retval; + + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } + + /* + * Make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + /* + * Changing the policy of the stop threads its a very bad idea: + */ + if (p == rq->stop) { + retval = -EINVAL; + goto unlock; + } + + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy) && dl_param_changed(p, attr)) + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; + goto unlock; + } +change: + + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow real-time tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + retval = -EPERM; + goto unlock; + } +#endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, p->cpus_ptr) || + rq->rd->dl_bw.bw == 0) { + retval = -EPERM; + goto unlock; + } + } +#endif + } + + /* Re-check policy now with rq lock held: */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + goto recheck; + } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { + retval = -EBUSY; + goto unlock; + } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; + } + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, queue_flags); + if (running) + put_prev_task(rq, p); + + prev_class = p->sched_class; + + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); + + if (queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, queue_flags); + } + if (running) + set_next_task(rq, p); + + check_class_changed(rq, p, prev_class, oldprio); + + /* Avoid rq from going away on us: */ + preempt_disable(); + head = splice_balance_callbacks(rq); + task_rq_unlock(rq, p, &rf); + + if (pi) { + if (cpuset_locked) + cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + + /* Run balance callbacks after we've adjusted the PI chain: */ + balance_callbacks(rq, head); + preempt_enable(); + + return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + return retval; +} + +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check, true); +} +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Use sched_set_fifo(), read its comment. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, true); +} + +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true, true); +} + +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, false); +} + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + return sched_setscheduler(p, policy, &lparam); +} + +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) +{ + u32 size; + int ret; + + /* Zero the full structure, so that a short copy will be nice: */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + /* ABI compatibility quirk: */ + if (!size) + size = SCHED_ATTR_SIZE_VER0; + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) + goto err_size; + + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; + } + + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + + return 0; + +err_size: + put_user(sizeof(*attr), &uattr->size); + return -E2BIG; +} + +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) +{ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); +} + +/** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + int retval; + + if (!uattr || pid < 0 || flags) + return -EINVAL; + + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); + + return sched_setattr(p, &attr); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; + } + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp = { .sched_priority = 0 }; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; +} + +/* + * Copy the kernel size attribute structure (which might be larger + * than what user-space knows about) to user-space. + * + * Note that all cases are valid: user-space buffer can be larger or + * smaller than the kernel-space buffer. The usual case is that both + * have the same size. + */ +static int +sched_attr_copy_to_user(struct sched_attr __user *uattr, + struct sched_attr *kattr, + unsigned int usize) +{ + unsigned int ksize = sizeof(*kattr); + + if (!access_ok(uattr, usize)) + return -EFAULT; + + /* + * sched_getattr() ABI forwards and backwards compatibility: + * + * If usize == ksize then we just copy everything to user-space and all is good. + * + * If usize < ksize then we only copy as much as user-space has space for, + * this keeps ABI compatibility as well. We skip the rest. + * + * If usize > ksize then user-space is using a newer version of the ABI, + * which part the kernel doesn't know about. Just ignore it - tooling can + * detect the kernel's knowledge of attributes from the attr->size value + * which is set to ksize in this case. + */ + kattr->size = min(usize, ksize); + + if (copy_to_user(uattr, kattr, kattr->size)) + return -EFAULT; + + return 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @usize: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, usize, unsigned int, flags) +{ + struct sched_attr kattr = { }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + +#ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + } + + return sched_attr_copy_to_user(uattr, &kattr, usize); +} + +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + guard(rcu)(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + return -EBUSY; + + return 0; +} +#endif /* CONFIG_SMP */ + +int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) +{ + int retval; + cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, ctx->new_mask, cpus_allowed); + + ctx->new_mask = new_mask; + ctx->flags |= SCA_CHECK; + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; + + retval = __set_cpus_allowed_ptr(p, ctx); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + + /* + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() + * will restore the previous user_cpus_ptr value. + * + * In the unlikely event a previous user_cpus_ptr exists, + * we need to further restrict the mask to what is allowed + * by that old user_cpus_ptr. + */ + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { + bool empty = !cpumask_and(new_mask, new_mask, + ctx->user_mask); + + if (WARN_ON_ONCE(empty)) + cpumask_copy(new_mask, cpus_allowed); + } + __set_cpus_allowed_ptr(p, ctx); + retval = -EINVAL; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + struct affinity_context ac; + struct cpumask *user_mask; + int retval; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; + + if (!check_same_owner(p)) { + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; + } + + retval = security_task_setscheduler(p); + if (retval) + return retval; + + /* + * With non-SMP configs, user_cpus_ptr/user_mask isn't used and + * alloc_user_cpus_ptr() returns NULL. + */ + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); + if (user_mask) { + cpumask_copy(user_mask, in_mask); + } else if (IS_ENABLED(CONFIG_SMP)) { + return -ENOMEM; + } + + ac = (struct affinity_context){ + .new_mask = in_mask, + .user_mask = user_mask, + .flags = SCA_USER, + }; + + retval = __sched_setaffinity(p, &ac); + kfree(ac.user_mask); + + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new CPU mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + int retval; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + guard(raw_spinlock_irqsave)(&p->pi_lock); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + + return 0; +} + +/** + * sys_sched_getaffinity - get the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current CPU mask + * + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + unsigned int retlen = min(len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +static void do_sched_yield(void) +{ + struct rq_flags rf; + struct rq *rq; + + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); + current->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); + sched_preempt_enable_no_resched(); + + schedule(); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); + return 0; +} + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, it's already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + do_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +int __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + int yielded = 0; + + scoped_guard (irqsave) { + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; + + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; + + if (!curr->sched_class->yield_to_task) + return 0; + + if (curr->sched_class != p->sched_class) + return 0; + + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; + + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } + } + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_RT_PRIO-1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; +} + +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) +{ + unsigned int time_slice = 0; + int retval; + + if (pid < 0) + return -EINVAL; + + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } + + jiffies_to_timespec64(time_slice, t); + return 0; +} + +/** + * sys_sched_rr_get_interval - return the default time-slice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the time-slice value. + * + * this syscall writes the default time-slice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the time-slice is in @interval. Otherwise, + * an error code. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct __kernel_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_old_timespec32(&t, interval); + return retval; +} +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a6994a1fcc90..784a0be81e84 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -501,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rd yet then + * If we don't want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -1176,7 +1176,7 @@ fail: * uniquely identify each group (for a given domain): * * - The first is the balance_cpu (see should_we_balance() and the - * load-balance blub in fair.c); for each group we only want 1 CPU to + * load-balance blurb in fair.c); for each group we only want 1 CPU to * continue balancing at a higher domain. * * - The second is the sched_group_capacity; we want all identical groups @@ -1388,7 +1388,7 @@ static inline void asym_cpu_capacity_update_data(int cpu) /* * Search if capacity already exits. If not, track which the entry - * where we should insert to keep the list ordered descendingly. + * where we should insert to keep the list ordered descending. */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) @@ -1853,7 +1853,7 @@ void sched_init_numa(int offline_node) struct cpumask ***masks; /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * O(nr_nodes^2) de-duplicating selection sort -- in order to find the * unique distances in the node_distance() table. */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); @@ -2750,7 +2750,7 @@ match2: } #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - /* Build perf. domains: */ + /* Build perf domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && @@ -2759,7 +2759,7 @@ match2: goto match3; } } - /* No match - add perf. domains for a new rd */ + /* No match - add perf domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]); match3: ; diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 0b1cd985dc27..134d7112ef71 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync EXPORT_SYMBOL(wake_bit_function); /* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * To allow interruptible waiting and asynchronous (i.e. non-blocking) * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are * permitted return codes. Nonzero return codes halt waiting and return. */ @@ -133,7 +133,7 @@ EXPORT_SYMBOL(__wake_up_bit); * @bit: the bit of the word being waited on * * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters + * is the part of the hash-table's accessor API that wakes up waiters * on a bit. For instance, if one were to have waiters on a bitflag, * one would call wake_up_bit() after clearing the bit. * diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e30b60b57614..dc51e521bc1d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -502,6 +502,9 @@ static inline pid_t seccomp_can_sync_threads(void) /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; + /* Skip exited threads. */ + if (thread->flags & PF_EXITING) + continue; if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || (thread->seccomp.mode == SECCOMP_MODE_FILTER && @@ -563,18 +566,21 @@ static void __seccomp_filter_release(struct seccomp_filter *orig) * @tsk: task the filter should be released from. * * This function should only be called when the task is exiting as - * it detaches it from its filter tree. As such, READ_ONCE() and - * barriers are not needed here, as would normally be needed. + * it detaches it from its filter tree. PF_EXITING has to be set + * for the task. */ void seccomp_filter_release(struct task_struct *tsk) { - struct seccomp_filter *orig = tsk->seccomp.filter; + struct seccomp_filter *orig; - /* We are effectively holding the siglock by not having any sighand. */ - WARN_ON(tsk->sighand != NULL); + if (WARN_ON((tsk->flags & PF_EXITING) == 0)) + return; + spin_lock_irq(&tsk->sighand->siglock); + orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; + spin_unlock_irq(&tsk->sighand->siglock); __seccomp_filter_release(orig); } @@ -602,6 +608,13 @@ static inline void seccomp_sync_threads(unsigned long flags) if (thread == caller) continue; + /* + * Skip exited threads. seccomp_filter_release could have + * been already called for this task. + */ + if (thread->flags & PF_EXITING) + continue; + /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); @@ -1466,7 +1479,7 @@ static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int s void *key) { /* Avoid a wakeup if event not interesting for us. */ - if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR | EPOLLHUP))) return 0; return autoremove_wake_function(wait, mode, sync, key); } @@ -1476,6 +1489,9 @@ static int recv_wait_event(struct seccomp_filter *filter) DEFINE_WAIT_FUNC(wait, recv_wake_function); int ret; + if (refcount_read(&filter->users) == 0) + return 0; + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) return 0; @@ -1484,6 +1500,8 @@ static int recv_wait_event(struct seccomp_filter *filter) if (atomic_dec_if_positive(&filter->notif->requests) >= 0) break; + if (refcount_read(&filter->users) == 0) + break; if (ret) return ret; diff --git a/kernel/signal.c b/kernel/signal.c index 1f9dd41c04be..60c737e423a1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,6 +2600,14 @@ static void do_freezer_trap(void) spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); schedule(); + + /* + * We could've been woken by task_work, run it to clear + * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. + */ + clear_notify_signal(); + if (unlikely(task_work_pending(current))) + task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) diff --git a/kernel/smp.c b/kernel/smp.c index f085ebcdf9e7..aaffecdad319 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -25,6 +25,7 @@ #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> +#include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS @@ -982,8 +983,7 @@ void __init smp_init(void) num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", - num_nodes, (num_nodes > 1 ? "s" : ""), - num_cpus, (num_cpus > 1 ? "s" : "")); + num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); @@ -1119,6 +1119,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); + destroy_work_on_stack(&sscs.work); return sscs.ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7eee421d4bc..c00a86931f8c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,8 +46,8 @@ COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL_COMPAT(io_pgetevents_time64); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_register); @@ -76,8 +76,6 @@ COND_SYSCALL(timerfd_gettime32); COND_SYSCALL(acct); COND_SYSCALL(capget); COND_SYSCALL(capset); -/* __ARCH_WANT_SYS_CLONE3 */ -COND_SYSCALL(clone3); COND_SYSCALL(futex); COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); @@ -392,3 +390,5 @@ COND_SYSCALL(setuid16); /* restartable sequence */ COND_SYSCALL(rseq); + +COND_SYSCALL(uretprobe); diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 6ef887c19c48..3ac98bb7fb82 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -367,6 +367,54 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); } +/* + * Test that registering an invalid extra value is not allowed. + */ +static void sysctl_test_register_sysctl_sz_invalid_extra_value( + struct kunit *test) +{ + unsigned char data = 0; + struct ctl_table table_foo[] = { + { + .procname = "foo", + .data = &data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_FOUR, + .extra2 = SYSCTL_ONE_THOUSAND, + }, + }; + + struct ctl_table table_bar[] = { + { + .procname = "bar", + .data = &data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + }; + + struct ctl_table table_qux[] = { + { + .procname = "qux", + .data = &data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, + }; + + KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo)); + KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar)); + KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux)); +} + static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), @@ -378,6 +426,7 @@ static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), + KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value), {} }; @@ -388,4 +437,5 @@ static struct kunit_suite sysctl_test_suite = { kunit_test_suites(&sysctl_test_suite); +MODULE_DESCRIPTION("KUnit test of proc sysctl"); MODULE_LICENSE("GPL v2"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e0b917328cf9..e4421594fc25 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -205,7 +205,7 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } -static void warn_sysctl_write(struct ctl_table *table) +static void warn_sysctl_write(const struct ctl_table *table) { pr_warn_once("%s wrote to %s when file position was not 0!\n" "This will not be supported in the future. To silence this\n" @@ -223,7 +223,7 @@ static void warn_sysctl_write(struct ctl_table *table) * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, - struct ctl_table *table) + const struct ctl_table *table) { if (!*ppos) return false; @@ -468,7 +468,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; -static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, +static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, @@ -541,7 +541,7 @@ out: return err; } -static int do_proc_dointvec(struct ctl_table *table, int write, +static int do_proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -552,7 +552,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, } static int do_proc_douintvec_w(unsigned int *tbl_data, - struct ctl_table *table, + const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, @@ -639,7 +639,7 @@ out: return err; } -static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, +static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, @@ -675,7 +675,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -int do_proc_douintvec(struct ctl_table *table, int write, +int do_proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -977,16 +977,10 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, if (table->maxlen != sizeof(u8)) return -EINVAL; - if (table->extra1) { + if (table->extra1) min = *(unsigned int *) table->extra1; - if (min > 255U) - return -EINVAL; - } - if (table->extra2) { + if (table->extra2) max = *(unsigned int *) table->extra2; - if (max > 255U) - return -EINVAL; - } tmp = *table; @@ -1023,8 +1017,9 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, - int write, void *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_doulongvec_minmax(void *data, + const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; @@ -1096,7 +1091,7 @@ out: return err; } -static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, +static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { diff --git a/kernel/task_work.c b/kernel/task_work.c index 95a7e1b7f1da..5c2daa7ad3f9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,10 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ +static void task_work_set_notify_irq(struct irq_work *entry) +{ + test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} +static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = + IRQ_WORK_INIT_HARD(task_work_set_notify_irq); + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -12,7 +20,7 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently @@ -24,6 +32,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. + * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the + * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of @@ -44,8 +54,13 @@ int task_work_add(struct task_struct *task, struct callback_head *work, { struct callback_head *head; - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + if (notify == TWA_NMI_CURRENT) { + if (WARN_ON_ONCE(task != current)) + return -EINVAL; + } else { + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + } head = READ_ONCE(task->task_works); do { @@ -66,6 +81,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; + case TWA_NMI_CURRENT: + irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); + break; default: WARN_ON_ONCE(1); break; @@ -120,9 +138,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data) } /** - * task_work_cancel - cancel a pending work added by task_work_add() - * @task: the task which should execute the work - * @func: identifies the work to remove + * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() + * @task: the task which should execute the func's work + * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. @@ -131,11 +149,35 @@ static bool task_work_func_match(struct callback_head *cb, void *data) * The found work or NULL if not found. */ struct callback_head * -task_work_cancel(struct task_struct *task, task_work_func_t func) +task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } +static bool task_work_match(struct callback_head *cb, void *data) +{ + return cb == data; +} + +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @cb: the callback to remove if queued + * + * Remove a callback from a task's queue if queued. + * + * RETURNS: + * True if the callback was queued and got cancelled, false otherwise. + */ +bool task_work_cancel(struct task_struct *task, struct callback_head *cb) +{ + struct callback_head *ret; + + ret = task_work_cancel_match(task, task_work_match, cb); + + return ret == cb; +} + /** * task_work_run - execute the works added by task_work_add() * @@ -168,7 +210,7 @@ void task_work_run(void) if (!work) break; /* - * Synchronize with task_work_cancel(). It can not remove + * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index d06185e054ea..62e73444ffe4 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -22,6 +22,7 @@ #include "tick-internal.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 492c14aac642..b8ee320208d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1285,6 +1285,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; + if (WARN_ON_ONCE(!timer->function)) + return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 20d5df631570..783f2297111b 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void) module_exit(udelay_test_exit); +MODULE_DESCRIPTION("udelay test module"); MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 771d1e040303..b4843099a8da 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* + * If the broadcast force bit of the current CPU is set, + * then the current CPU has not yet reprogrammed the local + * timer device to avoid a ping-pong race. See + * ___tick_broadcast_oneshot_control(). + * + * If the broadcast device is hrtimer based then + * programming the broadcast event below does not have any + * effect because the local clockevent device is not + * running and not programmed because the broadcast event + * is not earlier than the pending event of the local clock + * event device. As a consequence all CPUs waiting for a + * broadcast event are stuck forever. + * + * Detect this condition and reprogram the cpu local timer + * device to avoid the starvation. + */ + if (tick_check_broadcast_expired()) { + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); + tick_program_event(td->evtdev->next_event, 1); + } + /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d88b13076b79..a47bcf71defc 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -178,26 +178,6 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } -#ifdef CONFIG_NO_HZ_FULL -static void giveup_do_timer(void *info) -{ - int cpu = *(unsigned int *)info; - - WARN_ON(tick_do_timer_cpu != smp_processor_id()); - - tick_do_timer_cpu = cpu; -} - -static void tick_take_do_timer_from_boot(void) -{ - int cpu = smp_processor_id(); - int from = tick_do_timer_boot_cpu; - - if (from >= 0 && from != cpu) - smp_call_function_single(from, giveup_do_timer, &cpu, 1); -} -#endif - /* * Setup the tick device */ @@ -221,19 +201,25 @@ static void tick_setup_device(struct tick_device *td, tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* - * The boot CPU may be nohz_full, in which case set - * tick_do_timer_boot_cpu so the first housekeeping - * secondary that comes up will take do_timer from - * us. + * The boot CPU may be nohz_full, in which case the + * first housekeeping secondary will take do_timer() + * from it. */ if (tick_nohz_full_cpu(cpu)) tick_do_timer_boot_cpu = cpu; - } else if (tick_do_timer_boot_cpu != -1 && - !tick_nohz_full_cpu(cpu)) { - tick_take_do_timer_from_boot(); + } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) { tick_do_timer_boot_cpu = -1; - WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu); + /* + * The boot CPU will stay in periodic (NOHZ disabled) + * mode until clocksource_done_booting() called after + * smp_init() selects a high resolution clocksource and + * timekeeping_notify() kicks the NOHZ stuff alive. + * + * So this WRITE_ONCE can only race with the READ_ONCE + * check in tick_periodic() but this race is harmless. + */ + WRITE_ONCE(tick_do_timer_cpu, cpu); #endif } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71a792cd8936..753a184c7090 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; - WARN_ON_ONCE(1); - printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", - basemono, ts->next_tick, dev->next_event, - hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " + "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, + dev->next_event, hrtimer_active(&ts->sched_timer), + hrtimer_get_expires(&ts->sched_timer)); } /* @@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -/** - * tick_nohz_get_idle_calls - return the current idle calls counter value - * - * Called from the schedutil frequency scaling governor in scheduler context. - * - * Return: the current idle calls counter value for the current CPU - */ -unsigned long tick_nohz_get_idle_calls(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - return ts->idle_calls; -} - static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 3e5d422dd15c..2889763165e5 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = { }; kunit_test_suite(time_test_suite); +MODULE_DESCRIPTION("time unit test suite"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e18db1819f8..2fa87dcfeda9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. + */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || - tk->tkr_mono.clock->id != system_counterval.cs_id) + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. + */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { @@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc) #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { diff --git a/kernel/torture.c b/kernel/torture.c index c72ab2d251f4..dede150aef01 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -40,6 +40,7 @@ #include <linux/sched/rt.h> #include "rcu/rcu.h" +MODULE_DESCRIPTION("Common functions for in-kernel torture tests"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 166ad5444eea..721c3b221048 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1136,7 +1136,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on SYNTH_EVENTS + depends on SYNTH_EVENTS && m help This option creates a test module to check the base functionality of in-kernel synthetic event definition and @@ -1149,7 +1149,7 @@ config SYNTH_EVENT_GEN_TEST config KPROBE_EVENT_GEN_TEST tristate "Test module for in-kernel kprobe event generation" - depends on KPROBE_EVENTS + depends on KPROBE_EVENTS && m help This option creates a test module to check the base functionality of in-kernel kprobe event definition. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6249dac61701..cd098846e251 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1369,8 +1369,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_ptr: data to verify - * @sig_ptr: signature of the data + * @data_p: data to verify + * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* @@ -1378,10 +1378,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, - struct bpf_dynptr_kern *sig_ptr, +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; @@ -1444,7 +1446,7 @@ __bpf_kfunc_start_defs(); * bpf_get_file_xattr - get xattr of a file * @file: file to get xattr from * @name__str: name of the xattr - * @value_ptr: output buffer of the xattr value + * @value_p: output buffer of the xattr value * * Get xattr *name__str* of *file* and store the output in *value_ptr*. * @@ -1453,8 +1455,9 @@ __bpf_kfunc_start_defs(); * Return: 0 on success, a negative value on error. */ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, - struct bpf_dynptr_kern *value_ptr) + struct bpf_dynptr *value_p) { + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; struct dentry *dentry; u32 value_len; void *value; @@ -3517,7 +3520,6 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) } #endif /* CONFIG_UPROBES */ -#ifdef CONFIG_FPROBE __bpf_kfunc_start_defs(); __bpf_kfunc bool bpf_session_is_return(void) @@ -3566,4 +3568,3 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) } late_initcall(bpf_kprobe_multi_kfuncs_init); -#endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f44229294e9d..e5d6a4ab433b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7484,7 +7484,7 @@ allocate_ftrace_mod_map(struct module *mod, return mod_map; } -static const char * +static int ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, unsigned long addr, unsigned long *size, unsigned long *off, char *sym) @@ -7505,21 +7505,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, *size = found_func->size; if (off) *off = addr - found_func->ip; - if (sym) - strscpy(sym, found_func->name, KSYM_NAME_LEN); - - return found_func->name; + return strscpy(sym, found_func->name, KSYM_NAME_LEN); } - return NULL; + return 0; } -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { struct ftrace_mod_map *mod_map; - const char *ret = NULL; + int ret = 0; /* mod_map is freed via call_rcu() */ preempt_disable(); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 95106d02b32d..4966e6bbdf6f 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (upper_count-- > 0) { union upper_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); if (!chunk) break; *upper_next = chunk; @@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (lower_count-- > 0) { union lower_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); if (!chunk) break; *lower_next = chunk; @@ -451,6 +451,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) /** * trace_pid_list_free - Frees an allocated pid_list. + * @pid_list: The pid list to free. * * Frees the memory for a pid_list that was allocated. */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 16383247bdbf..61a6da808203 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -678,6 +678,21 @@ end: } #ifdef CONFIG_MODULES +static int validate_module_probe_symbol(const char *modname, const char *symbol); + +static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk) +{ + const char *p; + int ret = 0; + + p = strchr(trace_kprobe_symbol(tk), ':'); + if (p) + ret = validate_module_probe_symbol(module_name(mod), p + 1); + if (!ret) + ret = __register_trace_kprobe(tk); + return ret; +} + /* Module notifier call back, checking event on the module */ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -696,7 +711,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ __unregister_trace_kprobe(tk); - ret = __register_trace_kprobe(tk); + ret = register_module_trace_kprobe(mod, tk); if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), @@ -747,17 +762,81 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused) return 0; } -static unsigned int number_of_same_symbols(char *func_name) +static unsigned int number_of_same_symbols(const char *mod, const char *func_name) { struct sym_count_ctx ctx = { .count = 0, .name = func_name }; - kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + if (!mod) + kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); - module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx); return ctx.count; } +static int validate_module_probe_symbol(const char *modname, const char *symbol) +{ + unsigned int count = number_of_same_symbols(modname, symbol); + + if (count > 1) { + /* + * Users should use ADDR to remove the ambiguity of + * using KSYM only. + */ + return -EADDRNOTAVAIL; + } else if (count == 0) { + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + return -ENOENT; + } + return 0; +} + +#ifdef CONFIG_MODULES +/* Return NULL if the module is not loaded or under unloading. */ +static struct module *try_module_get_by_name(const char *name) +{ + struct module *mod; + + rcu_read_lock_sched(); + mod = find_module(name); + if (mod && !try_module_get(mod)) + mod = NULL; + rcu_read_unlock_sched(); + + return mod; +} +#else +#define try_module_get_by_name(name) (NULL) +#endif + +static int validate_probe_symbol(char *symbol) +{ + struct module *mod = NULL; + char *modname = NULL, *p; + int ret = 0; + + p = strchr(symbol, ':'); + if (p) { + modname = symbol; + symbol = p + 1; + *p = '\0'; + mod = try_module_get_by_name(modname); + if (!mod) + goto out; + } + + ret = validate_module_probe_symbol(modname, symbol); +out: + if (p) + *p = ':'; + if (mod) + module_put(mod); + return ret; +} + static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -881,6 +960,14 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); goto parse_error; } + ret = validate_probe_symbol(symbol); + if (ret) { + if (ret == -EADDRNOTAVAIL) + trace_probe_log_err(0, NON_UNIQ_SYMBOL); + else + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; + } if (is_return) ctx.flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); @@ -893,31 +980,6 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } } - if (symbol && !strchr(symbol, ':')) { - unsigned int count; - - count = number_of_same_symbols(symbol); - if (count > 1) { - /* - * Users should use ADDR to remove the ambiguity of - * using KSYM only. - */ - trace_probe_log_err(0, NON_UNIQ_SYMBOL); - ret = -EADDRNOTAVAIL; - - goto error; - } else if (count == 0) { - /* - * We can return ENOENT earlier than when register the - * kprobe. - */ - trace_probe_log_err(0, BAD_PROBE_ADDR); - ret = -ENOENT; - - goto error; - } - } - trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -1835,21 +1897,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, char *event; if (func) { - unsigned int count; - - count = number_of_same_symbols(func); - if (count > 1) - /* - * Users should use addr to remove the ambiguity of - * using func only. - */ - return ERR_PTR(-EADDRNOTAVAIL); - else if (count == 0) - /* - * We can return ENOENT earlier than when register the - * kprobe. - */ - return ERR_PTR(-ENOENT); + ret = validate_probe_symbol(func); + if (ret) + return ERR_PTR(ret); } /* @@ -2023,19 +2073,16 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on probing function entry.\n"); + if (WARN_ONCE(ret, "error on probing function entry.")) { warn++; } else { /* Enable trace point */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting new probe.\n"); + if (WARN_ONCE(tk == NULL, "error on probing function entry.")) { warn++; } else { file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else enable_trace_kprobe( @@ -2044,19 +2091,16 @@ static __init int kprobe_trace_self_tests_init(void) } ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on probing function return.\n"); + if (WARN_ONCE(ret, "error on probing function return.")) { warn++; } else { /* Enable trace point */ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting 2nd new probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) { warn++; } else { file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else enable_trace_kprobe( @@ -2079,18 +2123,15 @@ static __init int kprobe_trace_self_tests_init(void) /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting test probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting test probe.")) { warn++; } else { - if (trace_kprobe_nhit(tk) != 1) { - pr_warn("incorrect number of testprobe hits\n"); + if (WARN_ONCE(trace_kprobe_nhit(tk) != 1, + "incorrect number of testprobe hits.")) warn++; - } file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else disable_trace_kprobe( @@ -2098,18 +2139,15 @@ static __init int kprobe_trace_self_tests_init(void) } tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting 2nd test probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) { warn++; } else { - if (trace_kprobe_nhit(tk) != 1) { - pr_warn("incorrect number of testprobe2 hits\n"); + if (WARN_ONCE(trace_kprobe_nhit(tk) != 1, + "incorrect number of testprobe2 hits.")) warn++; - } file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else disable_trace_kprobe( @@ -2117,23 +2155,15 @@ static __init int kprobe_trace_self_tests_init(void) } ret = create_or_delete_trace_kprobe("-:testprobe"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on deleting a probe.\n"); + if (WARN_ONCE(ret, "error on deleting a probe.")) warn++; - } ret = create_or_delete_trace_kprobe("-:testprobe2"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on deleting a probe.\n"); + if (WARN_ONCE(ret, "error on deleting a probe.")) warn++; - } + end: - ret = dyn_events_release_all(&trace_kprobe_ops); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on cleaning up probes.\n"); - warn++; - } /* * Wait for the optimizer work to finish. Otherwise it might fiddle * with probes in already freed __init text. diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a8e28f9b9271..66a871553d4a 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1444,9 +1444,9 @@ static int run_osnoise(void) save_osn_sample_stats(osn_var, &s); /* - * if threshold is 0, use the default value of 5 us. + * if threshold is 0, use the default value of 1 us. */ - threshold = tracing_thresh ? : 5000; + threshold = tracing_thresh ? : 1000; /* * Apply PREEMPT and IRQ disabled options. diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 76a772072557..04e4513f2985 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -15,7 +15,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(struct ctl_table *table) +static void *get_uts(const struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 003474c9a77d..1745ca788ede 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,6 +125,7 @@ enum wq_internal_consts { HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 32, + WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; /* @@ -215,8 +216,6 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ - struct list_head dying_workers; /* A: workers about to die */ - struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -435,7 +434,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_pod_attrs_buf; +static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -446,6 +445,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */ +static cpumask_var_t wq_online_cpumask; + /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; @@ -1683,33 +1685,6 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } -/** - * pwq_activate_work - Activate a work item if inactive - * @pwq: pool_workqueue @work belongs to - * @work: work item to activate - * - * Returns %true if activated. %false if already active. - */ -static bool pwq_activate_work(struct pool_workqueue *pwq, - struct work_struct *work) -{ - struct worker_pool *pool = pwq->pool; - struct wq_node_nr_active *nna; - - lockdep_assert_held(&pool->lock); - - if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) - return false; - - nna = wq_node_nr_active(pwq->wq, pool->node); - if (nna) - atomic_inc(&nna->nr); - - pwq->nr_active++; - __pwq_activate_work(pwq, work); - return true; -} - static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); @@ -2116,7 +2091,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { - unsigned long work_data; + unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); @@ -2125,13 +2100,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * - * An inactive work item cannot be grabbed directly because + * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active - * management later on and cause stall. Make sure the work - * item is activated before grabbing. + * management later on and cause stall. Move the linked + * barrier work items to the worklist when deleting the grabbed + * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that + * it doesn't participate in nr_active management in later + * pwq_dec_nr_in_flight(). */ - pwq_activate_work(pwq, work); + if (work_data & WORK_STRUCT_INACTIVE) + move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); @@ -2139,7 +2118,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ - work_data = *work_data_bits(work); set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); @@ -2297,9 +2275,13 @@ retry: * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. + * + * For ordered workqueue, work items must be queued on the newest pwq + * for accurate order management. Guaranteed order also guarantees + * non-reentrancy. See the comments above unplug_oldest_pwq(). */ last_pool = get_work_pool(work); - if (last_pool && last_pool != pool) { + if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -2709,6 +2691,27 @@ static void worker_attach_to_pool(struct worker *worker, mutex_unlock(&wq_pool_attach_mutex); } +static void unbind_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + kthread_set_per_cpu(worker->task, -1); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + + +static void detach_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + unbind_worker(worker); + list_del(&worker->node); + worker->pool = NULL; +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool @@ -2720,26 +2723,36 @@ static void worker_attach_to_pool(struct worker *worker, static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct completion *detach_completion = NULL; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); - - kthread_set_per_cpu(worker->task, -1); - list_del(&worker->node); - worker->pool = NULL; - - if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) - detach_completion = pool->detach_completion; + detach_worker(worker); mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); +} - if (detach_completion) - complete(detach_completion); +static int format_worker_id(char *buf, size_t size, struct worker *worker, + struct worker_pool *pool) +{ + if (worker->rescue_wq) + return scnprintf(buf, size, "kworker/R-%s", + worker->rescue_wq->name); + + if (pool) { + if (pool->cpu >= 0) + return scnprintf(buf, size, "kworker/%d:%d%s", + pool->cpu, worker->id, + pool->attrs->nice < 0 ? "H" : ""); + else + return scnprintf(buf, size, "kworker/u%d:%d", + pool->id, worker->id); + } else { + return scnprintf(buf, size, "kworker/dying"); + } } /** @@ -2758,7 +2771,6 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); @@ -2777,17 +2789,14 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; if (!(pool->flags & POOL_BH)) { - if (pool->cpu >= 0) - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, - pool->attrs->nice < 0 ? "H" : ""); - else - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + char id_buf[WORKER_ID_LEN]; + format_worker_id(id_buf, sizeof(id_buf), worker, pool); worker->task = kthread_create_on_node(worker_thread, worker, - pool->node, "kworker/%s", id_buf); + pool->node, "%s", id_buf); if (IS_ERR(worker->task)) { if (PTR_ERR(worker->task) == -EINTR) { - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n", id_buf); } else { pr_err_once("workqueue: Failed to create a worker thread: %pe", @@ -2827,35 +2836,22 @@ fail: return NULL; } -static void unbind_worker(struct worker *worker) +static void detach_dying_workers(struct list_head *cull_list) { - lockdep_assert_held(&wq_pool_attach_mutex); + struct worker *worker; - kthread_set_per_cpu(worker->task, -1); - if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); - else - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + list_for_each_entry(worker, cull_list, entry) + detach_worker(worker); } -static void wake_dying_workers(struct list_head *cull_list) +static void reap_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); - unbind_worker(worker); - /* - * If the worker was somehow already running, then it had to be - * in pool->idle_list when set_worker_dying() happened or we - * wouldn't have gotten here. - * - * Thus, the worker must either have observed the WORKER_DIE - * flag, or have set its state to TASK_IDLE. Either way, the - * below will be observed by the worker and is safe to do - * outside of pool->lock. - */ - wake_up_process(worker->task); + kthread_stop_put(worker->task); + kfree(worker); } } @@ -2889,7 +2885,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) worker->flags |= WORKER_DIE; list_move(&worker->entry, list); - list_move(&worker->node, &pool->dying_workers); + + /* get an extra task struct reference for later kthread_stop_put() */ + get_task_struct(worker->task); } /** @@ -2948,9 +2946,9 @@ static void idle_cull_fn(struct work_struct *work) /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker - * cannot proceed beyong worker_detach_from_pool() in its self-destruct - * path. This is required as a previously-preempted worker could run after - * set_worker_dying() has happened but before wake_dying_workers() did. + * cannot proceed beyong set_pf_worker() in its self-destruct path. + * This is required as a previously-preempted worker could run after + * set_worker_dying() has happened but before detach_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); @@ -2971,8 +2969,10 @@ static void idle_cull_fn(struct work_struct *work) } raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); + + reap_dying_workers(&cull_list); } static void send_mayday(struct work_struct *work) @@ -3350,11 +3350,8 @@ woke_up: raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); - kfree(worker); return 0; } @@ -4745,7 +4742,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); - INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -4887,7 +4883,6 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { - DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; LIST_HEAD(cull_list); @@ -4939,14 +4934,11 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); - if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) - pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); + reap_dying_workers(&cull_list); /* shut down the timers */ del_timer_sync(&pool->idle_timer); @@ -5022,12 +5014,6 @@ fail: return NULL; } -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. @@ -5073,7 +5059,7 @@ static void pwq_release_workfn(struct kthread_work *work) raw_spin_unlock_irq(&nna->lock); } - call_rcu(&pwq->rcu, rcu_free_pwq); + kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one @@ -5145,14 +5131,22 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +static void apply_wqattrs_lock(void) +{ + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); +} + /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU - * @cpu_going_down: if >= 0, the CPU to consider as offline * - * Calculate the cpumask a workqueue with @attrs should use on @pod. If - * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled @@ -5161,29 +5155,18 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, - int cpu_going_down) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; - /* does @pod have any online CPUs @attrs wants? */ + /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); - cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); - if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - - if (cpumask_empty(attrs->__pod_cpumask)) { + /* does @pod have any online CPUs @attrs wants? */ + if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } - - /* yeap, return possible CPUs in @pod that @attrs wants */ - cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - - if (cpumask_empty(attrs->__pod_cpumask)) - pr_warn_once("WARNING: workqueue cpumask: online intersect > " - "possible intersect\n"); } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ @@ -5268,7 +5251,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_pod_cpumask(new_attrs, cpu, -1); + wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; @@ -5359,8 +5342,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). - * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, @@ -5368,8 +5349,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, { int ret; - lockdep_assert_cpus_held(); - mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); mutex_unlock(&wq_pool_mutex); @@ -5378,15 +5357,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug + * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU to update pool association for - * @hotplug_cpu: the CPU coming up or going down - * @online: whether @cpu is coming up or going down + * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of - * @wq accordingly. + * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. * * * If pod affinity can't be adjusted due to memory allocation failure, it falls @@ -5399,10 +5375,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_pod(struct workqueue_struct *wq, int cpu, - int hotplug_cpu, bool online) +static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { - int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -5416,13 +5390,13 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_pod_attrs_buf; + target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; @@ -5456,21 +5430,24 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; + lockdep_assert_held(&wq_pool_mutex); + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { + struct worker_pool __percpu *pools; + + if (wq->flags & WQ_BH) + pools = bh_worker_pools; + else + pools = cpu_worker_pools; + for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; - struct worker_pool __percpu *pools; struct worker_pool *pool; - if (wq->flags & WQ_BH) - pools = bh_worker_pools; - else - pools = cpu_worker_pools; - pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); @@ -5488,26 +5465,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; - ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { - ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - cpus_read_unlock(); - - /* for unbound pwq, flush the pwq_release_worker ensures that the - * pwq_release_workfn() completes before calling kfree(wq). - */ - if (ret) - kthread_flush_worker(pwq_release_worker); return ret; @@ -5542,8 +5511,11 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, static int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; + char id_buf[WORKER_ID_LEN]; int ret; + lockdep_assert_held(&wq_pool_mutex); + if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -5555,7 +5527,9 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); + format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL); + + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", @@ -5566,7 +5540,7 @@ static int init_rescuer(struct workqueue_struct *wq) wq->rescuer = rescuer; if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, wq_unbound_cpumask); + kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); @@ -5714,21 +5688,14 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, goto err_unreg_lockdep; } - if (alloc_and_link_pwqs(wq) < 0) - goto err_free_node_nr_active; - - if (wq_online && init_rescuer(wq) < 0) - goto err_destroy; - - if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) - goto err_destroy; - /* - * wq_pool_mutex protects global freeze state and workqueues list. - * Grab it, adjust max_active and add the new @wq to workqueues - * list. + * wq_pool_mutex protects the workqueues list, allocations of PWQs, + * and the global freeze state. */ - mutex_lock(&wq_pool_mutex); + apply_wqattrs_lock(); + + if (alloc_and_link_pwqs(wq) < 0) + goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); @@ -5736,13 +5703,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, list_add_tail_rcu(&wq->list, &workqueues); - mutex_unlock(&wq_pool_mutex); + if (wq_online && init_rescuer(wq) < 0) + goto err_unlock_destroy; + + apply_wqattrs_unlock(); + + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; return wq; -err_free_node_nr_active: - if (wq->flags & WQ_UNBOUND) +err_unlock_free_node_nr_active: + apply_wqattrs_unlock(); + /* + * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, + * flushing the pwq_release_worker ensures that the pwq_release_workfn() + * completes before calling kfree(wq). + */ + if (wq->flags & WQ_UNBOUND) { + kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); + } err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); @@ -5750,6 +5731,8 @@ err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; +err_unlock_destroy: + apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; @@ -6384,19 +6367,15 @@ void show_freezable_workqueues(void) /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { - int off; - - /* always show the actual comm */ - off = strscpy(buf, task->comm, size); - if (off < 0) - return; - /* stabilize PF_WQ_WORKER and worker pool association */ mutex_lock(&wq_pool_attach_mutex); if (task->flags & PF_WQ_WORKER) { struct worker *worker = kthread_data(task); struct worker_pool *pool = worker->pool; + int off; + + off = format_worker_id(buf, size, worker, pool); if (pool) { raw_spin_lock_irq(&pool->lock); @@ -6415,6 +6394,8 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) } raw_spin_unlock_irq(&pool->lock); } + } else { + strscpy(buf, task->comm, size); } mutex_unlock(&wq_pool_attach_mutex); @@ -6590,6 +6571,8 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); + cpumask_set_cpu(cpu, wq_online_cpumask); + for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) @@ -6612,7 +6595,7 @@ int workqueue_online_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, true); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); @@ -6636,6 +6619,9 @@ int workqueue_offline_cpu(unsigned int cpu) /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); + + cpumask_clear_cpu(cpu, wq_online_cpumask); + list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; @@ -6644,7 +6630,7 @@ int workqueue_offline_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, false); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); @@ -6870,8 +6856,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask * * This function can be called from cpuset code to provide a set of isolated - * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold - * either cpus_read_lock or cpus_write_lock. + * CPUs that should be excluded from wq_unbound_cpumask. */ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) { @@ -6881,12 +6866,8 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) return -ENOMEM; - lockdep_assert_cpus_held(); mutex_lock(&wq_pool_mutex); - /* Save the current isolated cpumask & export it via sysfs */ - cpumask_copy(wq_isolated_cpumask, exclude_cpumask); - /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to @@ -6898,6 +6879,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); + /* Save the current isolated cpumask & export it via sysfs */ + if (!ret) + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; @@ -6931,9 +6916,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { - wq_update_pod(wq, cpu, cpu, true); - } + for_each_online_cpu(cpu) + unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); @@ -7021,19 +7005,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -7232,16 +7203,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { + ret = 0; apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + if (!ret) + cpumask_copy(wq_requested_unbound_cpumask, cpumask); apply_wqattrs_unlock(); } @@ -7560,10 +7527,18 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) notrace void wq_watchdog_touch(int cpu) { + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); + unsigned long now = jiffies; + if (cpu >= 0) - per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + per_cpu(wq_watchdog_touched_cpu, cpu) = now; + else + WARN_ONCE(1, "%s should be called with valid CPU", __func__); - wq_watchdog_touched = jiffies; + /* Don't unnecessarily store to global cacheline */ + if (time_after(now, touch_ts + thresh / 4)) + WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) @@ -7673,10 +7648,12 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); + cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); @@ -7687,8 +7664,8 @@ void __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); + unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. @@ -7953,12 +7930,12 @@ void __init workqueue_init_topology(void) /* * Workqueues allocated earlier would have all CPUs sharing the default - * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU - * combinations to apply per-pod sharing. + * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue + * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) - wq_update_pod(wq, cpu, cpu, true); + unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); |