diff options
Diffstat (limited to 'kernel')
160 files changed, 5241 insertions, 3796 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 6c34e63c88ff..4d111f871951 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,7 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" - default y + default ARCH_DEFAULT_CRASH_DUMP depends on ARCH_SUPPORTS_CRASH_DUMP depends on KEXEC_CORE select VMCORE_INFO diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fe782cd77388..54ea59ff8fbe 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -11,12 +11,16 @@ config PREEMPT_BUILD select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK +config ARCH_HAS_PREEMPT_LAZY + bool + choice prompt "Preemption Model" default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" + depends on !PREEMPT_RT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -32,6 +36,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT + depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more @@ -51,7 +56,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_BUILD + select PREEMPT_BUILD if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -67,9 +72,23 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. 
+config PREEMPT_LAZY + bool "Scheduler controlled preemption model" + depends on !ARCH_NO_PREEMPT + depends on ARCH_HAS_PREEMPT_LAZY + select PREEMPT_BUILD if !PREEMPT_DYNAMIC + help + This option provides a scheduler driven preemption model that + is fundamentally similar to full preemption, but is less + eager to preempt SCHED_NORMAL tasks in an attempt to + reduce lock holder preemption and recover some of the performance + gains seen from using Voluntary preemption. + +endchoice + config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT + depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -84,8 +103,6 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. -endchoice - config PREEMPT_COUNT bool @@ -95,7 +112,7 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + depends on HAVE_PREEMPT_DYNAMIC select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD default y if HAVE_PREEMPT_DYNAMIC_CALL diff --git a/kernel/audit.c b/kernel/audit.c index 1edaa4846a47..6a95a6077953 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,7 +123,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. 
*/ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; -static u32 audit_sig_sid; +static struct lsm_prop audit_sig_lsm; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -1473,20 +1473,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } case AUDIT_SIGNAL_INFO: len = 0; - if (audit_sig_sid) { - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (lsmprop_is_set(&audit_sig_lsm)) { + err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx, + &len); if (err) return err; } sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { - if (audit_sig_sid) + if (lsmprop_is_set(&audit_sig_lsm)) security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; - if (audit_sig_sid) { + if (lsmprop_is_set(&audit_sig_lsm)) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } @@ -2102,8 +2103,8 @@ bool audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: length of string (not including trailing null) * @string: string to be logged + * @len: length of string (not including trailing null) * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, @@ -2178,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { + struct lsm_prop prop; char *ctx = NULL; unsigned len; int error; - u32 sid; - security_current_getsecid_subj(&sid); - if (!sid) + security_current_getlsmprop_subj(&prop); + if (!lsmprop_is_set(&prop)) return 0; - error = security_secid_to_secctx(sid, &ctx, &len); + error = security_lsmprop_to_secctx(&prop, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -2404,7 +2405,7 @@ int 
audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_current_getsecid_subj(&audit_sig_sid); + security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); diff --git a/kernel/audit.h b/kernel/audit.h index a60d2840559e..0211cb307d30 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/audit.h> +#include <linux/security.h> #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> #include <linux/tty.h> @@ -81,7 +82,7 @@ struct audit_names { kuid_t uid; kgid_t gid; dev_t rdev; - u32 osid; + struct lsm_prop oprop; struct audit_cap_data fcap; unsigned int fcap_ver; unsigned char type; /* record type */ @@ -143,7 +144,7 @@ struct audit_context { kuid_t target_auid; kuid_t target_uid; unsigned int target_sessionid; - u32 target_sid; + struct lsm_prop target_ref; char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; @@ -160,7 +161,7 @@ struct audit_context { kuid_t uid; kgid_t gid; umode_t mode; - u32 osid; + struct lsm_prop oprop; int has_perm; uid_t perm_uid; gid_t perm_gid; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 470041c49a44..bceb9f58a09e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1339,8 +1339,8 @@ int audit_filter(int msgtype, unsigned int listtype) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; + struct lsm_prop prop = { }; pid_t pid; - u32 sid; switch (f->type) { case AUDIT_PID: @@ -1370,9 +1370,10 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_current_getsecid_subj(&sid); - result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule); + security_current_getlsmprop_subj(&prop); + result = security_audit_rule_match( + &prop, f->type, f->op, + f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 
cd57053b4a69..91afdd0d036e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -100,7 +100,7 @@ struct audit_aux_data_pids { kuid_t target_auid[AUDIT_AUX_PIDS]; kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; - u32 target_sid[AUDIT_AUX_PIDS]; + struct lsm_prop target_ref[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; int pid_count; }; @@ -470,7 +470,7 @@ static int audit_filter_rules(struct task_struct *tsk, { const struct cred *cred; int i, need_sid = 1; - u32 sid; + struct lsm_prop prop = { }; unsigned int sessionid; if (ctx && rule->prio <= ctx->prio) @@ -674,14 +674,16 @@ static int audit_filter_rules(struct task_struct *tsk, * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still - * use security_current_getsecid_subj() + * use + * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ - security_current_getsecid_subj(&sid); + security_current_getlsmprop_subj(&prop); need_sid = 0; } - result = security_audit_rule_match(sid, f->type, + result = security_audit_rule_match(&prop, + f->type, f->op, f->lsm_rule); } @@ -697,14 +699,14 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, + &name->oprop, f->type, f->op, f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (security_audit_rule_match( - n->osid, + &n->oprop, f->type, f->op, f->lsm_rule)) { @@ -716,7 +718,7 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find ipc objects that match */ if (!ctx || ctx->type != AUDIT_IPC) break; - if (security_audit_rule_match(ctx->ipc.osid, + if (security_audit_rule_match(&ctx->ipc.oprop, f->type, f->op, f->lsm_rule)) ++result; @@ -1017,7 +1019,7 @@ static void audit_reset_context(struct audit_context *ctx) ctx->target_pid = 0; ctx->target_auid = ctx->target_uid = KUIDT_INIT(0); 
ctx->target_sessionid = 0; - ctx->target_sid = 0; + lsmprop_init(&ctx->target_ref); ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); @@ -1091,8 +1093,9 @@ static inline void audit_free_context(struct audit_context *context) } static int audit_log_pid_context(struct audit_context *context, pid_t pid, - kuid_t auid, kuid_t uid, unsigned int sessionid, - u32 sid, char *comm) + kuid_t auid, kuid_t uid, + unsigned int sessionid, struct lsm_prop *prop, + char *comm) { struct audit_buffer *ab; char *ctx = NULL; @@ -1106,8 +1109,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (sid) { - if (security_secid_to_secctx(sid, &ctx, &len)) { + if (lsmprop_is_set(prop)) { + if (security_lsmprop_to_secctx(prop, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1384,19 +1387,17 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } - case AUDIT_IPC: { - u32 osid = context->ipc.osid; - + case AUDIT_IPC: audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", from_kuid(&init_user_ns, context->ipc.uid), from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); - if (osid) { + if (lsmprop_is_set(&context->ipc.oprop)) { char *ctx = NULL; u32 len; - if (security_secid_to_secctx(osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", osid); + if (security_lsmprop_to_secctx(&context->ipc.oprop, + &ctx, &len)) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", ctx); @@ -1416,7 +1417,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_gid, context->ipc.perm_mode); } - break; } + break; case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " @@ -1558,13 +1559,11 @@ 
static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (n->osid != 0) { + if (lsmprop_is_set(&n->oprop)) { char *ctx = NULL; u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); + if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { if (call_panic) *call_panic = 2; } else { @@ -1653,8 +1652,8 @@ static void audit_log_uring(struct audit_context *ctx) audit_log_format(ab, "uring_op=%d", ctx->uring_op); if (ctx->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", - (ctx->return_valid == AUDITSC_SUCCESS ? - "yes" : "no"), + str_yes_no(ctx->return_valid == + AUDITSC_SUCCESS), ctx->return_code); audit_log_format(ab, " items=%d" @@ -1696,8 +1695,8 @@ static void audit_log_exit(void) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", - (context->return_valid == AUDITSC_SUCCESS ? 
- "yes" : "no"), + str_yes_no(context->return_valid == + AUDITSC_SUCCESS), context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", @@ -1780,7 +1779,7 @@ static void audit_log_exit(void) axs->target_auid[i], axs->target_uid[i], axs->target_sessionid[i], - axs->target_sid[i], + &axs->target_ref[i], axs->target_comm[i])) call_panic = 1; } @@ -1789,7 +1788,7 @@ static void audit_log_exit(void) audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - context->target_sid, context->target_comm)) + &context->target_ref, context->target_comm)) call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { @@ -2278,7 +2277,7 @@ static void audit_copy_inode(struct audit_names *name, name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); + security_inode_getlsmprop(inode, &name->oprop); if (flags & AUDIT_INODE_NOEVAL) { name->fcap_ver = -1; return; @@ -2632,7 +2631,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; context->ipc.has_perm = 0; - security_ipc_getsecid(ipcp, &context->ipc.osid); + security_ipc_getlsmprop(ipcp, &context->ipc.oprop); context->type = AUDIT_IPC; } @@ -2729,7 +2728,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &context->target_sid); + security_task_getlsmprop_obj(t, &context->target_ref); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2756,7 +2755,7 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &ctx->target_sid); + security_task_getlsmprop_obj(t, &ctx->target_ref); memcpy(ctx->target_comm, 
t->comm, TASK_COMM_LEN); return 0; } @@ -2777,7 +2776,7 @@ int audit_signal_info_syscall(struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); + security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 0a79aee6523d..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -78,13 +77,12 @@ void bpf_inode_storage_free(struct inode *inode) static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; - struct fd f = fdget_raw(*(int *)key); + CLASS(fd_raw, f)(*(int *)key); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true); - fdput(f); return sdata ? 
sdata->data : NULL; } @@ -92,19 +90,16 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; - struct fd f = fdget_raw(*(int *)key); + CLASS(fd_raw, f)(*(int *)key); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!inode_storage_ptr(file_inode(fd_file(f)))) { - fdput(f); + if (!inode_storage_ptr(file_inode(fd_file(f)))) return -EBADF; - } sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); - fdput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -123,15 +118,11 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { - struct fd f = fdget_raw(*(int *)key); - int err; + CLASS(fd_raw, f)(*(int *)key); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - - err = inode_storage_delete(file_inode(fd_file(f)), map); - fdput(f); - return err; + return inode_storage_delete(file_inode(fd_file(f)), map); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 112581cf97e7..106735145948 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -283,7 +283,6 @@ static int iter_release(struct inode *inode, struct file *file) const struct file_operations bpf_iter_fops = { .open = iter_open, - .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) diff --git 
a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 83bbf935c562..5cd1c7a23848 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3523,7 +3523,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3543,6 +3543,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. 
+ */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3593,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3681,10 +3687,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } @@ -7711,21 +7717,16 @@ int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) struct btf *btf_get_by_fd(int fd) { struct btf *btf; - struct fd f; + CLASS(fd, f)(fd); - f = fdget(fd); - - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &btf_fops) { - fdput(f); + if (fd_file(f)->f_op != &btf_fops) return ERR_PTR(-EINVAL); - } btf = fd_file(f)->private_data; refcount_inc(&btf->refcnt); - fdput(f); return btf; } @@ -8966,6 +8967,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..025d7e2214ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. 
Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e07cc057d6f..e303626bdb2f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,7 +21,7 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -40,7 +40,7 @@ #include <linux/execmem.h> #include <asm/barrier.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { 
.dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..3d45ebe8afb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = 
ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, @@ -2851,21 +2851,47 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. 
+ */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It @@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) @@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; - const unsigned long 
*bits; - int bit; + int bit = kit->bit, nr_bits = kit->nr_bits; + const void *bits; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(&param->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = 
kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b4f18c85d7bc..645bd30bc9a9 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -11,24 +11,18 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; - struct fd f; - int ret; + CLASS(fd, f)(inner_map_ufd); - f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* Does not support >1 level map-in-map */ - if (inner_map->inner_map_meta) { - ret = -EINVAL; - goto put; - } + if (inner_map->inner_map_meta) + return ERR_PTR(-EINVAL); - if (!inner_map->ops->map_meta_equal) { - ret = -ENOTSUPP; - goto put; - } + if (!inner_map->ops->map_meta_equal) + return ERR_PTR(-ENOTSUPP); inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ @@ -36,10 +30,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); - if (!inner_map_meta) { - ret = -ENOMEM; - goto put; - } + if (!inner_map_meta) + return ERR_PTR(-ENOMEM); inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; @@ -53,8 +45,9 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * invalid/empty/valid, but ERR_PTR in case of errors. During * equality NULL or IS_ERR is equivalent. 
*/ - ret = PTR_ERR(inner_map_meta->record); - goto free; + struct bpf_map *ret = ERR_CAST(inner_map_meta->record); + kfree(inner_map_meta); + return ret; } /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like @@ -77,14 +70,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_array_meta->elem_size = inner_array->elem_size; inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; } - - fdput(f); return inner_map_meta; -free: - kfree(inner_map_meta); -put: - fdput(f); - return ERR_PTR(ret); } void bpf_map_meta_free(struct bpf_map *map_meta) @@ -110,9 +96,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, int ufd) { struct bpf_map *inner_map, *inner_map_meta; - struct fd f; + CLASS(fd, f)(ufd); - f = fdget(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; @@ -123,7 +108,6 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, else inner_map = ERR_PTR(-EINVAL); - fdput(f); return inner_map; } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..146f5b57cfb1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? 
NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ 
smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8386f25bc532..c5aa127ed4cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1425,21 +1425,6 @@ put_token: return err; } -/* if error is returned, fd is released. - * On success caller should complete fd access with matching fdput() - */ -struct bpf_map *__bpf_map_get(struct fd f) -{ - if (!fd_file(f)) - return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &bpf_map_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - return fd_file(f)->private_data; -} - void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); @@ -1455,15 +1440,11 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); struct bpf_map *bpf_map_get(u32 ufd) { - struct fd f = fdget(ufd); - struct bpf_map *map; + CLASS(fd, f)(ufd); + struct bpf_map *map = __bpf_map_get(f); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return map; - - bpf_map_inc(map); - fdput(f); + if (!IS_ERR(map)) + bpf_map_inc(map); return map; } @@ -1471,15 +1452,11 @@ EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { - struct fd f = fdget(ufd); - struct bpf_map *map; - - map = __bpf_map_get(f); - if (IS_ERR(map)) - return map; + CLASS(fd, f)(ufd); + struct bpf_map *map = __bpf_map_get(f); - bpf_map_inc_with_uref(map); - fdput(f); + if (!IS_ERR(map)) + bpf_map_inc_with_uref(map); return map; } @@ -1544,11 +1521,9 @@ static int 
map_lookup_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -1557,26 +1532,20 @@ static int map_lookup_elem(union bpf_attr *attr) if (attr->flags & ~BPF_F_LOCK) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { - err = -EPERM; - goto err_put; - } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) + return -EPERM; if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; - goto err_put; - } + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; key = __bpf_copy_key(ukey, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto err_put; - } + if (IS_ERR(key)) + return PTR_ERR(key); value_size = bpf_map_value_size(map); @@ -1607,8 +1576,6 @@ free_value: kvfree(value); free_key: kvfree(key); -err_put: - fdput(f); return err; } @@ -1619,17 +1586,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -1667,7 +1632,6 @@ free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -1676,16 +1640,14 @@ err_put: static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); - int ufd = attr->map_fd; struct bpf_map *map; - struct fd f; void *key; int err; if 
(CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -1722,7 +1684,6 @@ out: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -1733,30 +1694,24 @@ static int map_get_next_key(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *unext_key = u64_to_user_ptr(attr->next_key); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { - err = -EPERM; - goto err_put; - } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) + return -EPERM; if (ukey) { key = __bpf_copy_key(ukey, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto err_put; - } + if (IS_ERR(key)) + return PTR_ERR(key); } else { key = NULL; } @@ -1788,8 +1743,6 @@ free_next_key: kvfree(next_key); free_key: kvfree(key); -err_put: - fdput(f); return err; } @@ -2018,11 +1971,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) @@ -2031,7 +1982,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (attr->flags & ~BPF_F_LOCK) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -2101,7 +2052,6 @@ free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -2109,27 +2059,22 @@ err_put: static int map_freeze(const union bpf_attr *attr) { - int err = 0, ufd = attr->map_fd; + int err = 0; struct 
bpf_map *map; - struct fd f; if (CHECK_ATTR(BPF_MAP_FREEZE)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { - fdput(f); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; - } - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { - fdput(f); + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) return -EPERM; - } mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { @@ -2144,7 +2089,6 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: mutex_unlock(&map->freeze_mutex); - fdput(f); return err; } @@ -2414,18 +2358,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *____bpf_prog_get(struct fd f) -{ - if (!fd_file(f)) - return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &bpf_prog_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - return fd_file(f)->private_data; -} - void bpf_prog_add(struct bpf_prog *prog, int i) { atomic64_add(i, &prog->aux->refcnt); @@ -2481,20 +2413,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog, static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, bool attach_drv) { - struct fd f = fdget(ufd); + CLASS(fd, f)(ufd); struct bpf_prog *prog; - prog = ____bpf_prog_get(f); - if (IS_ERR(prog)) - return prog; - if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { - prog = ERR_PTR(-EINVAL); - goto out; - } + if (fd_empty(f)) + return ERR_PTR(-EBADF); + if (fd_file(f)->f_op != &bpf_prog_fops) + return ERR_PTR(-EINVAL); + + prog = fd_file(f)->private_data; + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) + return ERR_PTR(-EINVAL); bpf_prog_inc(prog); -out: - fdput(f); return prog; } @@ -3138,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = 
filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -3263,20 +3198,16 @@ int bpf_link_new_fd(struct bpf_link *link) struct bpf_link *bpf_link_get_from_fd(u32 ufd) { - struct fd f = fdget(ufd); + CLASS(fd, f)(ufd); struct bpf_link *link; - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) { - fdput(f); + if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) return ERR_PTR(-EINVAL); - } link = fd_file(f)->private_data; bpf_link_inc(link); - fdput(f); - return link; } EXPORT_SYMBOL(bpf_link_get_from_fd); @@ -3638,15 +3569,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3654,10 +3586,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return 
err; @@ -3682,7 +3621,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3690,7 +3629,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3712,7 +3651,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3721,6 +3660,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3746,12 +3686,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, 
uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -4981,33 +4927,25 @@ static int bpf_link_get_info_by_fd(struct file *file, static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { - int ufd = attr->info.bpf_fd; - struct fd f; - int err; - if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; - f = fdget(ufd); - if (!fd_file(f)) + CLASS(fd, f)(attr->info.bpf_fd); + if (fd_empty(f)) return -EBADFD; if (fd_file(f)->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, + return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, + return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) - err = bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else - err = -EINVAL; - - fdput(f); - return err; + return -EINVAL; } #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd @@ -5195,14 +5133,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; - int err, ufd; - struct fd f; + int err; if (CHECK_ATTR(BPF_MAP_BATCH)) return -EINVAL; - ufd = attr->batch.map_fd; - f = fdget(ufd); + CLASS(fd, f)(attr->batch.map_fd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -5230,7 +5167,6 @@ err_put: maybe_wait_bpf_programs(map); 
bpf_map_write_active_dec(map); } - fdput(f); return err; } @@ -5960,7 +5896,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 9a1d356e79ed..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -116,67 +115,52 @@ int bpf_token_create(union bpf_attr *attr) struct user_namespace *userns; struct inode *inode; struct file *file; + CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; - struct fd f; 
+ struct super_block *sb; umode_t mode; int err, fd; - f = fdget(attr->token_create.bpffs_fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; path = fd_file(f)->f_path; - path_get(&path); - fdput(f); + sb = path.dentry->d_sb; - if (path.dentry != path.mnt->mnt_sb->s_root) { - err = -EINVAL; - goto out_path; - } - if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { - err = -EINVAL; - goto out_path; - } + if (path.dentry != sb->s_root) + return -EINVAL; + if (sb->s_op != &bpf_super_ops) + return -EINVAL; err = path_permission(&path, MAY_ACCESS); if (err) - goto out_path; + return err; - userns = path.dentry->d_sb->s_user_ns; + userns = sb->s_user_ns; /* * Enforce that creators of BPF tokens are in the same user * namespace as the BPF FS instance. This makes reasoning about * permissions a lot easier and we can always relax this later. */ - if (current_user_ns() != userns) { - err = -EPERM; - goto out_path; - } - if (!ns_capable(userns, CAP_BPF)) { - err = -EPERM; - goto out_path; - } + if (current_user_ns() != userns) + return -EPERM; + if (!ns_capable(userns, CAP_BPF)) + return -EPERM; /* Creating BPF token in init_user_ns doesn't make much sense. 
*/ - if (current_user_ns() == &init_user_ns) { - err = -EOPNOTSUPP; - goto out_path; - } + if (current_user_ns() == &init_user_ns) + return -EOPNOTSUPP; - mnt_opts = path.dentry->d_sb->s_fs_info; + mnt_opts = sb->s_fs_info; if (mnt_opts->delegate_cmds == 0 && mnt_opts->delegate_maps == 0 && mnt_opts->delegate_progs == 0 && - mnt_opts->delegate_attachs == 0) { - err = -ENOENT; /* no BPF token delegation is set up */ - goto out_path; - } + mnt_opts->delegate_attachs == 0) + return -ENOENT; /* no BPF token delegation is set up */ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_path; - } + inode = bpf_get_inode(sb, NULL, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); inode->i_op = &bpf_token_iops; inode->i_fop = &bpf_token_fops; @@ -185,8 +169,7 @@ int bpf_token_create(union bpf_attr *attr) file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); if (IS_ERR(file)) { iput(inode); - err = PTR_ERR(file); - goto out_path; + return PTR_ERR(file); } token = kzalloc(sizeof(*token), GFP_USER); @@ -218,33 +201,27 @@ int bpf_token_create(union bpf_attr *attr) file->private_data = token; fd_install(fd, file); - path_put(&path); return fd; out_token: bpf_token_free(token); out_file: fput(file); -out_path: - path_put(&path); return err; } struct bpf_token *bpf_token_get_from_fd(u32 ufd) { - struct fd f = fdget(ufd); + CLASS(fd, f)(ufd); struct bpf_token *token; - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &bpf_token_fops) { - fdput(f); + if (fd_file(f)->f_op != &bpf_token_fops) return ERR_PTR(-EINVAL); - } token = fd_file(f)->private_data; bpf_token_inc(token); - fdput(f); return token; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd86282ccaa4..bb99bada7e2e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2750,10 +2750,16 @@ static struct 
btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) @@ -6333,10 +6339,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -6798,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). 
- */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -7432,7 +7428,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -7444,7 +7441,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7452,15 +7449,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? 
BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7471,7 +7466,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7499,7 +7494,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7507,7 +7501,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7532,6 +7526,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7547,15 +7542,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. 
For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. */ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7575,9 +7567,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7588,13 +7579,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. 
@@ -7604,10 +7593,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7633,13 +7620,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -8942,9 +8928,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8959,9 +8944,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9003,7 +8988,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9011,10 +8998,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); @@ -14264,12 +14257,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. 
*/ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ @@ -15326,8 +15320,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15338,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. 
*/ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); @@ -17877,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17889,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. */ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; @@ -18920,6 +18922,53 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } +/* Add map behind fd to used maps list, if it's not already there, and return + * its index. Also set *reused to true if this map was already in the list of + * used maps. + * Returns <0 on error, or >= 0 index, on success. 
+ */ +static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reused) +{ + CLASS(fd, f)(fd); + struct bpf_map *map; + int i; + + map = __bpf_map_get(f); + if (IS_ERR(map)) { + verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); + return PTR_ERR(map); + } + + /* check whether we recorded this map already */ + for (i = 0; i < env->used_map_cnt; i++) { + if (env->used_maps[i] == map) { + *reused = true; + return i; + } + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); + return -E2BIG; + } + + if (env->prog->sleepable) + atomic64_inc(&map->sleepable_refcnt); + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in bpf_free_used_maps() + */ + bpf_map_inc(map); + + *reused = false; + env->used_maps[env->used_map_cnt++] = map; + + return env->used_map_cnt - 1; +} + /* find and rewrite pseudo imm in ld_imm64 instructions: * * 1. if it accesses map FD, replace it with actual map pointer. 
@@ -18931,7 +18980,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j, err; + int i, err; err = bpf_prog_calc_tag(env->prog); if (err) @@ -18948,9 +18997,10 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; - struct fd f; + int map_idx; u64 addr; u32 fd; + bool reused; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -19011,20 +19061,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) break; } - f = fdget(fd); - map = __bpf_map_get(f); - if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); - return PTR_ERR(map); - } + map_idx = add_used_map_from_fd(env, fd, &reused); + if (map_idx < 0) + return map_idx; + map = env->used_maps[map_idx]; + + aux = &env->insn_aux_data[i]; + aux->map_index = map_idx; err = check_map_prog_compatibility(env, map, env->prog); - if (err) { - fdput(f); + if (err) return err; - } - aux = &env->insn_aux_data[i]; if (insn[0].src_reg == BPF_PSEUDO_MAP_FD || insn[0].src_reg == BPF_PSEUDO_MAP_IDX) { addr = (unsigned long)map; @@ -19033,13 +19081,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (off >= BPF_MAX_VAR_OFF) { verbose(env, "direct value offset of %u is not allowed\n", off); - fdput(f); return -EINVAL; } if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); - fdput(f); return -EINVAL; } @@ -19047,7 +19093,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (err) { verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", map->value_size, off); - fdput(f); return err; } @@ -19058,70 +19103,39 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; - 
/* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) { - if (env->used_maps[j] == map) { - aux->map_index = j; - fdput(f); - goto next_insn; - } - } - - if (env->used_map_cnt >= MAX_USED_MAPS) { - verbose(env, "The total number of maps per program has reached the limit of %u\n", - MAX_USED_MAPS); - fdput(f); - return -E2BIG; - } - - if (env->prog->sleepable) - atomic64_inc(&map->sleepable_refcnt); - /* hold the map. If the program is rejected by verifier, - * the map will be released by release_maps() or it - * will be used by the valid program until it's unloaded - * and all maps are released in bpf_free_used_maps() - */ - bpf_map_inc(map); - - aux->map_index = env->used_map_cnt; - env->used_maps[env->used_map_cnt++] = map; + /* proceed with extra checks only if its newly added used map */ + if (reused) + goto next_insn; if (bpf_map_is_cgroup_storage(map) && bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); - fdput(f); return -EBUSY; } if (map->map_type == BPF_MAP_TYPE_ARENA) { if (env->prog->aux->arena) { verbose(env, "Only one arena per program\n"); - fdput(f); return -EBUSY; } if (!env->allow_ptr_leaks || !env->bpf_capable) { verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); - fdput(f); return -EPERM; } if (!env->prog->jit_requested) { verbose(env, "JIT is required to use arena\n"); - fdput(f); return -EOPNOTSUPP; } if (!bpf_jit_supports_arena()) { verbose(env, "JIT doesn't support arena\n"); - fdput(f); return -EOPNOTSUPP; } env->prog->aux->arena = (void *)map; if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { verbose(env, "arena's user address must be set via map_extra or mmap()\n"); - fdput(f); return -EINVAL; } } - fdput(f); next_insn: insn++; i++; @@ -21189,7 +21203,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement 
bpf_kptr_xchg inline */ @@ -22298,7 +22312,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22534,6 +22548,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5886b95c6eae..d9061bd55436 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2140,8 +2140,10 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto exit_stats; - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); + if (root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + } trace_cgroup_setup_root(root); @@ -2314,10 +2316,8 @@ static void cgroup_kill_sb(struct super_block *sb) * And don't kill the default root. 
*/ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - cgroup_bpf_offline(&root->cgrp); + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); - } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -5710,9 +5710,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_kernfs_remove; - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; + if (cgrp->root == &cgrp_dfl_root) { + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + } /* * New cgroup inherits effective freeze counter, and @@ -5789,7 +5791,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; - int level = 1; + int level = 0; lockdep_assert_held(&cgroup_mutex); @@ -5797,7 +5799,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; - if (level > cgroup->max_depth) + if (level >= cgroup->max_depth) goto fail; level++; @@ -6026,7 +6028,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - cgroup_bpf_offline(cgrp); + if (cgrp->root == &cgrp_dfl_root) + cgroup_bpf_offline(cgrp); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6476,7 +6479,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; - struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); @@ -6493,14 +6495,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return 0; } - f = fget_raw(kargs->cgroup); - if (!f) { + CLASS(fd_raw, f)(kargs->cgroup); + if (fd_empty(f)) { ret = -EBADF; goto err; } - sb = f->f_path.dentry->d_sb; + sb = fd_file(f)->f_path.dentry->d_sb; - dst_cgrp = cgroup_get_from_file(f); + dst_cgrp = cgroup_get_from_file(fd_file(f)); if 
(IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; @@ -6548,15 +6550,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) } put_css_set(cset); - fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); - if (f) - fput(f); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); @@ -6966,14 +6965,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { - struct cgroup *cgrp; - struct fd f = fdget_raw(fd); - if (!fd_file(f)) + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(fd_file(f)); - fdput(f); - return cgrp; + return cgroup_v1v2_get_from_file(fd_file(f)); } /** diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a4dd285cdf39..f321ed515f3a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -84,9 +84,19 @@ static bool have_boot_isolcpus; static struct list_head remote_children; /* - * A flag to force sched domain rebuild at the end of an operation while - * inhibiting it in the intermediate stages when set. Currently it is only - * set in hotplug code. + * A flag to force sched domain rebuild at the end of an operation. + * It can be set in + * - update_partition_sd_lb() + * - remote_partition_check() + * - update_cpumasks_hier() + * - cpuset_update_flag() + * - cpuset_hotplug_update_tasks() + * - cpuset_handle_hotplug() + * + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * + * Note that update_relax_domain_level() in cpuset-v1.c can still call + * rebuild_sched_domains_locked() directly without using this flag. 
*/ static bool force_sd_rebuild; @@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } +static inline bool cpuset_v2(void) +{ + return !IS_ENABLED(CONFIG_CPUSETS_V1) || + cgroup_subsys_on_dfl(cpuset_cgrp_subsys); +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs) */ static inline bool is_in_v2_mode(void) { - return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + return cpuset_v2() || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } @@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. + * tasks. This check is not done when scheduling is disabled as the + * users should know what they are doing. + * + * For v1, effective_cpus == cpus_allowed & user_xcpus() returns + * cpus_allowed. + * + * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only + * for non-isolated partition root. At this point, the target + * effective_cpus isn't computed yet. user_xcpus() is the best + * approximation. + * + * TBD: May need to precompute the real effective_cpus here in case + * incorrect scheduling of SCHED_DEADLINE tasks in a partition + * becomes an issue. 
*/ ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) + if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) && + !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial))) goto out; /* @@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool cgrpv2 = cpuset_v2(); int nslot_update; doms = NULL; @@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void) lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); + force_sd_rebuild = false; /* * If we have raced with CPU hotplug, return early to avoid @@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } - if (rebuild_domains && !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (rebuild_domains) + cpuset_force_rebuild(); } /* @@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs) { struct cpuset *parent = parent_cs(cs); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (!cpuset_v2()) return; lockdep_assert_held(&callback_lock); @@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs) * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update * @new_prs: new partition_root_state - * @tmp: temparary masks + * @tmp: temporary masks * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs @@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. 
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); @@ -1387,7 +1416,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, /* * remote_partition_disable - Remove current cpuset from remote partition list * @cs: the cpuset to update - * @tmp: temparary masks + * @tmp: temporary masks * * The effective_cpus is also updated. * @@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); @@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask - * @tmp: temparary masks + * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. @@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, update_unbound_workqueue_cpumask(isolcpus_updated); /* - * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + * Propagate changes in top_cpuset's effective_cpus down the hierarchy. */ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); @@ -1480,7 +1509,7 @@ invalidate: * @cs: the cpuset to be updated * @newmask: the new effective_xcpus mask * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temparary masks + * @tmp: temporary masks * * This should be called before the given cs has updated its cpus_allowed * and/or effective_xcpus. 
@@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, remote_partition_disable(child, tmp); disable_cnt++; } - if (disable_cnt && !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (disable_cnt) + cpuset_force_rebuild(); } /* @@ -1923,12 +1952,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, } /* - * update_cpumasks_hier() flags - */ -#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ -#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ - -/* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup @@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - int flags) + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * Skip the whole subtree if * 1) the cpumask remains the same, * 2) has no partition root state, - * 3) HIER_CHECKALL flag not set, and + * 3) force flag not set, and * 4) for v2 load balance state same as its parent. */ - if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (!cpuset_v2() || (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2086,8 +2109,7 @@ get_css: * from parent if current cpuset isn't a valid partition root * and their load balance states differ. 
*/ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_partition_valid(cp) && + if (cpuset_v2() && !is_partition_valid(cp) && (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { if (is_sched_load_balance(parent)) set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); @@ -2103,8 +2125,7 @@ get_css: */ if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && - (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_valid(cp))) + (!cpuset_v2() || is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -2112,9 +2133,8 @@ get_css: } rcu_read_unlock(); - if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) && - !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (need_rebuild_sched_domains) + cpuset_force_rebuild(); } /** @@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * directly. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. HIER_NO_SD_REBUILD - * flag is used to suppress rebuild of sched domains as the callers - * will take care of that. + * release the RCU read lock before calling it. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; /* - * When exclusive_cpus isn't explicitly set, it is constrainted + * When exclusive_cpus isn't explicitly set, it is constrained * by cpus_allowed and parent's effective_xcpus. Otherwise, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. @@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. 
*/ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); - if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if ((retval == -EINVAL) && cpuset_v2()) { struct cgroup_subsys_state *css; struct cpuset *cp; @@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* effective_cpus/effective_xcpus will be updated here */ - update_cpumasks_hier(cs, &tmp, hier_flags); + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; - int hier_flags = 0; + bool force = false; int old_prs = cs->partition_root_state; if (!*buf) { @@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Check all the descendants in update_cpumasks_hier() if * effective_xcpus is to be changed. */ - if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) - hier_flags = HIER_CHECKALL; + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); retval = validate_change(cs, trialcs); if (retval) @@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, * of the subtree when it is a valid partition root or effective_xcpus * is updated. 
*/ - if (is_partition_valid(cs) || hier_flags) - update_cpumasks_hier(cs, &tmp, hier_flags); + if (is_partition_valid(cs) || force) + update_cpumasks_hier(cs, &tmp, force); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; spin_unlock_irq(&callback_lock); - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed && - !force_sd_rebuild) - rebuild_sched_domains_locked(); + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) { + if (cpuset_v2()) + cpuset_force_rebuild(); + else + rebuild_sched_domains_locked(); + } if (spread_flag_changed) cpuset1_update_tasks_flags(cs); @@ -2853,12 +2872,14 @@ out: update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + update_cpumasks_hier(cs, &tmpmask, !new_prs); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); free_cpumasks(NULL, &tmpmask); return 0; } @@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * migration permission derives from hierarchy ownership in * cgroup_procs_write_permission()). */ - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - (cpus_updated || mems_updated)) { + if (!cpuset_v2() || (cpus_updated || mems_updated)) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * in effective cpus and mems. In that case, we can optimize out * by skipping the task iteration and update. 
*/ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !cpus_updated && !mems_updated) { + if (cpuset_v2() && !cpus_updated && !mems_updated) { cpuset_attach_nodemask_to = cs->effective_mems; goto out; } @@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); + if (force_sd_rebuild) + rebuild_sched_domains_locked(); out_unlock: mutex_unlock(&cpuset_mutex); cpus_read_unlock(); @@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (cpuset_v2()) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); return &cs->css; @@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) + if (cpuset_v2() && !is_sched_load_balance(parent)) clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_partition_valid(cs)) update_prstate(cs, 0); - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - is_sched_load_balance(cs)) + if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); @@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void) rcu_read_unlock(); } - /* rebuild sched domains if cpus_allowed has changed */ - if (force_sd_rebuild) { - force_sd_rebuild = false; + /* rebuild sched domains if necessary */ + if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - } free_cpumasks(NULL, ptmp); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 617861a54793..bf1690a167dd 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -9,6 +9,28 @@ #include <trace/events/cgroup.h> /* + * Update CGRP_FROZEN of cgroup.flag + * Return true 
if flags is updated; false if flags has no change + */ +static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen) +{ + lockdep_assert_held(&css_set_lock); + + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen) + return false; + + if (frozen) + set_bit(CGRP_FROZEN, &cgrp->flags); + else + clear_bit(CGRP_FROZEN, &cgrp->flags); + + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); + return true; +} + +/* * Propagate the cgroup frozen state upwards by the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) @@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; - if (!test_bit(CGRP_FROZEN, &cgrp->flags) && - test_bit(CGRP_FREEZE, &cgrp->flags) && - cgrp->freezer.nr_frozen_descendants == - cgrp->nr_descendants) { - set_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); - desc++; - } + if (!test_bit(CGRP_FREEZE, &cgrp->flags) || + (cgrp->freezer.nr_frozen_descendants != + cgrp->nr_descendants)) + continue; } else { cgrp->freezer.nr_frozen_descendants -= desc; - if (test_bit(CGRP_FROZEN, &cgrp->flags)) { - clear_bit(CGRP_FROZEN, &cgrp->flags); - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); - desc++; - } } + + if (cgroup_update_frozen_flag(cgrp, frozen)) + desc++; } } @@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; - lockdep_assert_held(&css_set_lock); - /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider @@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp) frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - if (frozen) { - /* Already there? 
*/ - if (test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - set_bit(CGRP_FROZEN, &cgrp->flags); - } else { - /* Already there? */ - if (!test_bit(CGRP_FROZEN, &cgrp->flags)) - return; - - clear_bit(CGRP_FROZEN, &cgrp->flags); - } - cgroup_file_notify(&cgrp->events_file); - TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - - /* Update the state of ancestor cgroups. */ - cgroup_propagate_frozen(cgrp, frozen); + /* If flags is updated, update the state of ancestor cgroups. */ + if (cgroup_update_frozen_flag(cgrp, frozen)) + cgroup_propagate_frozen(cgrp, frozen); } /* @@ -260,8 +257,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task, void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; + struct cgroup *parent; struct cgroup *dsct; bool applied = false; + bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -282,22 +281,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) if (cgroup_is_dead(dsct)) continue; - if (freeze) { - dsct->freezer.e_freeze++; - /* - * Already frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 1) - continue; - } else { - dsct->freezer.e_freeze--; - /* - * Still frozen because of ancestor's settings? - */ - if (dsct->freezer.e_freeze > 0) - continue; - - WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + /* + * e_freeze is affected by parent's e_freeze and dst's freeze. + * If old e_freeze eq new e_freeze, no change, its children + * will not be affected. 
So do nothing and skip the subtree + */ + old_e = dsct->freezer.e_freeze; + parent = cgroup_parent(dsct); + dsct->freezer.e_freeze = (dsct->freezer.freeze || + parent->freezer.e_freeze); + if (dsct->freezer.e_freeze == old_e) { + css = css_rightmost_descendant(css); + continue; } /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a06b45272411..5877974ece92 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif + dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif + dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { - case CPUTIME_USER: case CPUTIME_NICE: + rstatc->bstat.ntime += delta_exec; + fallthrough; + case CPUTIME_USER: rstatc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: @@ -591,6 +595,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif + bstat->ntime += cpustat[CPUTIME_NICE]; } } @@ -608,13 +613,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; + u64 usage, utime, stime, ntime; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, 
&stime); + ntime = cgrp->bstat.ntime; cgroup_rstat_flush_release(cgrp); } else { /* cgrp->bstat of root is not actually used, reuse it */ @@ -622,16 +628,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) usage = cgrp->bstat.cputime.sum_exec_runtime; utime = cgrp->bstat.cputime.utime; stime = cgrp->bstat.cputime.stime; + ntime = cgrp->bstat.ntime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); + do_div(ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); + "user_usec %llu\n" + "system_usec %llu\n" + "nice_usec %llu\n", + usage, utime, stime, ntime); cgroup_force_idle_show(seq, &cgrp->bstat); } diff --git a/kernel/cpu.c b/kernel/cpu.c index d293d52a3e00..6e34b52cb5ce 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1338,7 +1338,7 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); - tick_cleanup_dead_cpu(cpu); + lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu)); /* * Callbacks must be re-integrated right away to the RCU state machine. 
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9d34d2364b5a..f625172d4b67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -33,7 +33,7 @@ #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "debug_core.h" #define KGDB_MAX_THREAD_QUERY 17 diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5b6934e23c21..e33691d5adf7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 2e0f75bcb7fd..8485f63863af 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return -EINTR; } - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) @@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return ret; ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a8071c45c80..5d4a54f50826 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -966,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); - if (IS_ERR(css)) { - ret = 
PTR_ERR(css); - goto out; - } + if (IS_ERR(css)) + return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) - goto out; + return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -995,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, perf_detach_cgroup(event); ret = -EINVAL; } -out: - fdput(f); return ret; } @@ -2146,7 +2142,7 @@ static void perf_put_aux_event(struct perf_event *event) static bool perf_need_aux_event(struct perf_event *event) { - return !!event->attr.aux_output || !!event->attr.aux_sample_size; + return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, @@ -2171,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event, !perf_aux_output_match(event, group_leader)) return 0; + if ((event->attr.aux_pause || event->attr.aux_resume) && + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return 0; + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; @@ -5998,18 +5998,9 @@ EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; -static inline int perf_fget_light(int fd, struct fd *p) +static inline bool is_perf_file(struct fd f) { - struct fd f = fdget(fd); - if (!fd_file(f)) - return -EBADF; - - if (fd_file(f)->f_op != &perf_fops) { - fdput(f); - return -EBADF; - } - *p = f; - return 0; + return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, @@ -6057,20 +6048,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_OUTPUT: { - int ret; + CLASS(fd, output)(arg); // arg == -1 => empty + struct perf_event *output_event = NULL; if (arg != -1) { - struct perf_event *output_event; - struct fd output; - ret = perf_fget_light(arg, &output); - if (ret) - return ret; + if (!is_perf_file(output)) + return -EBADF; output_event = 
fd_file(output)->private_data; - ret = perf_event_set_output(event, output_event); - fdput(output); - } else { - ret = perf_event_set_output(event, NULL); } - return ret; + return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: @@ -6821,7 +6806,6 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { - .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -7023,6 +7007,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif +static bool should_sample_guest(struct perf_event *event) +{ + return !event->attr.exclude_guest && perf_guest_state(); +} + +unsigned long perf_misc_flags(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_arch_guest_misc_flags(regs); + + return perf_arch_misc_flags(regs); +} + +unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_guest_get_ip(); + + return perf_arch_instruction_pointer(regs); +} + static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) @@ -7840,7 +7847,7 @@ void perf_prepare_sample(struct perf_sample_data *data, __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); + data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } @@ -8004,7 +8011,7 @@ void perf_prepare_header(struct perf_event_header *header, { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); - header->misc = perf_misc_flags(regs); + header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do @@ -8017,6 +8024,49 @@ void perf_prepare_header(struct 
perf_event_header *header, WARN_ON_ONCE(header->size & 7); } +static void __perf_event_aux_pause(struct perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. + */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, @@ -9252,7 +9302,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } @@ -9819,9 +9869,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) - return ret; + goto out; /* * XXX event_limit might not quite work as expected on inherited @@ -9883,6 +9936,9 @@ static int __perf_event_overflow(struct perf_event *event, event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; } @@ -12274,11 +12330,25 @@ 
perf_event_alloc(struct perf_event_attr *attr, int cpu, } if (event->attr.aux_output && - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } + if (event->attr.aux_pause && event->attr.aux_resume) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + event->hw.aux_paused = 1; + } + if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) @@ -12665,7 +12735,6 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = EMPTY_FD; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -12736,10 +12805,12 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { - err = perf_fget_light(group_fd, &group); - if (err) + if (!is_perf_file(group)) { + err = -EBADF; goto err_fd; + } group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; @@ -12751,7 +12822,7 @@ SYSCALL_DEFINE5(perf_event_open, task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); - goto err_group_fd; + goto err_fd; } } @@ -13018,12 +13089,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&current->perf_event_mutex); /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). + * File reference in group guarantees that group_leader has been + * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find + * the pointer to itself in perf_group_detach(). */ - fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -13042,8 +13112,6 @@ err_alloc: err_task: if (task) put_task_struct(task); -err_group_fd: - fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -13074,7 +13142,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ - if (attr->aux_output) + if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, @@ -13960,7 +14028,7 @@ static void perf_event_clear_cpumask(unsigned int cpu) } /* migrate */ - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + list_for_each_entry(pmu, &pmus, entry) { if (pmu->scope == PERF_PMU_SCOPE_NONE || WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index e072d995d670..249288d82b8d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -52,6 +52,7 @@ struct perf_buffer { void (*free_aux)(void *); refcount_t aux_refcount; int aux_in_sampling; + int aux_in_pause_resume; void **aux_pages; void *aux_priv; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ec796e2f055..a76ddc5fc982 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,9 @@ #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/rcupdate_trace.h> +#include <linux/workqueue.h> +#include <linux/srcu.h> #include <linux/uprobes.h> @@ -42,8 +45,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); 
-DEFINE_STATIC_SRCU(uprobes_srcu); - #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; @@ -51,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); +/* Covers return_instance's uprobe lifetime. */ +DEFINE_STATIC_SRCU(uretprobes_srcu); + /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -62,10 +66,13 @@ struct uprobe { struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct work; + }; loff_t offset; loff_t ref_ctr_offset; - unsigned long flags; + unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type @@ -100,7 +107,6 @@ static LIST_HEAD(delayed_uprobe_list); */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; @@ -620,17 +626,23 @@ static inline bool uprobe_is_active(struct uprobe *uprobe) return !RB_EMPTY_NODE(&uprobe->rb_node); } -static void uprobe_free_rcu(struct rcu_head *rcu) +static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } -static void put_uprobe(struct uprobe *uprobe) +static void uprobe_free_srcu(struct rcu_head *rcu) { - if (!refcount_dec_and_test(&uprobe->ref)) - return; + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); +} + +static void uprobe_free_deferred(struct work_struct *work) +{ + struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); @@ -651,7 +663,162 @@ static void put_uprobe(struct uprobe *uprobe) delayed_uprobe_remove(uprobe, NULL); 
mutex_unlock(&delayed_uprobe_lock); - call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); + /* start srcu -> rcu_tasks_trace -> kfree chain */ + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + INIT_WORK(&uprobe->work, uprobe_free_deferred); + schedule_work(&uprobe->work); +} + +/* Initialize hprobe as SRCU-protected "leased" uprobe */ +static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) +{ + WARN_ON(!uprobe); + hprobe->state = HPROBE_LEASED; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = srcu_idx; +} + +/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ +static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) +{ + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = -1; +} + +/* + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be + * used only once for a given hprobe. + * + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever + * is appropriate. + */ +static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) +{ + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); + switch (*hstate) { + case HPROBE_LEASED: + case HPROBE_STABLE: + return hprobe->uprobe; + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ + return NULL; + default: + WARN(1, "hprobe invalid state %d", *hstate); + return NULL; + } +} + +/* + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. + * hprobe_finalize() can only be used from current context after + * hprobe_consume() call (which determines uprobe and hstate value). 
+ */ +static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) +{ + switch (hstate) { + case HPROBE_LEASED: + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + break; + case HPROBE_STABLE: + put_uprobe(hprobe->uprobe); + break; + case HPROBE_GONE: + case HPROBE_CONSUMED: + break; + default: + WARN(1, "hprobe invalid state %d", hstate); + break; + } +} + +/* + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of + * them can win the race to perform SRCU unlocking. Whoever wins must perform + * SRCU unlock. + * + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe + * to begin with or we failed to bump its refcount and it's going away. + * + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU + * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has + * an extra refcount for caller to assume and use. Otherwise, it's not + * guaranteed that returned uprobe has a positive refcount, so caller has to + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current + * SRCU lock region. See dup_utask(). + */ +static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) +{ + enum hprobe_state hstate; + + /* + * return_instance's hprobe is protected by RCU. + * Underlying uprobe is itself protected from reuse by SRCU. + */ + lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + + hstate = READ_ONCE(hprobe->state); + switch (hstate) { + case HPROBE_STABLE: + /* uprobe has positive refcount, bump refcount, if necessary */ + return get ? 
get_uprobe(hprobe->uprobe) : hprobe->uprobe; + case HPROBE_GONE: + /* + * SRCU was unlocked earlier and we didn't manage to take + * uprobe refcnt, so it's effectively NULL + */ + return NULL; + case HPROBE_CONSUMED: + /* + * uprobe was consumed, so it's effectively NULL as far as + * uretprobe processing logic is concerned + */ + return NULL; + case HPROBE_LEASED: { + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); + /* + * Try to switch hprobe state, guarding against + * hprobe_consume() or another hprobe_expire() racing with us. + * Note, if we failed to get uprobe refcount, we use special + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't + * be used as it will be freed after SRCU is unlocked. + */ + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { + /* We won the race, we are the ones to unlock SRCU */ + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + return get ? get_uprobe(uprobe) : uprobe; + } + + /* + * We lost the race, undo refcount bump (if it ever happened), + * unless caller would like an extra refcount anyways. + */ + if (uprobe && !get) + put_uprobe(uprobe); + /* + * Even if hprobe_consume() or another hprobe_expire() wins + * the state update race and unlocks SRCU from under us, we + * still have a guarantee that underlying uprobe won't be + * freed due to ongoing caller's SRCU lock region, so we can + * return it regardless. Also, if `get` was true, we also have + * an extra ref for the caller to own. This is used in dup_utask().
+ */ + return uprobe; + } + default: + WARN(1, "unknown hprobe state %d", hstate); + return NULL; + } } static __always_inline @@ -706,7 +873,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) struct rb_node *node; unsigned int seq; - lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); + lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); @@ -825,8 +992,11 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { + static atomic64_t id; + down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); + uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } @@ -934,8 +1104,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1156,7 +1325,8 @@ void uprobe_unregister_sync(void) * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. 
*/ - synchronize_srcu(&uprobes_srcu); + synchronize_rcu_tasks_trace(); + synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); @@ -1240,19 +1410,18 @@ EXPORT_SYMBOL_GPL(uprobe_register); int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; - int ret = -ENOENT, srcu_idx; + int ret = -ENOENT; down_write(&uprobe->register_rwsem); - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + rcu_read_lock_trace(); + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); @@ -1475,9 +1644,15 @@ static vm_fault_t xol_fault(const struct vm_special_mapping *sm, return 0; } +static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, + .mremap = xol_mremap, }; /* Slot allocation for XOL */ @@ -1545,7 +1720,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); + area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; @@ -1553,7 +1728,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - atomic_set(&area->slot_count, 1); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); @@ -1626,92 +1800,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) } } -/* - * - search for a free slot. 
- */ -static unsigned long xol_take_insn_slot(struct xol_area *area) +static unsigned long xol_get_slot_nr(struct xol_area *area) { - unsigned long slot_addr; - int slot_nr; - - do { - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); - if (slot_nr < UINSNS_PER_PAGE) { - if (!test_and_set_bit(slot_nr, area->bitmap)) - break; + unsigned long slot_nr; - slot_nr = UINSNS_PER_PAGE; - continue; - } - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); - } while (slot_nr >= UINSNS_PER_PAGE); - - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); - atomic_inc(&area->slot_count); + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + return slot_nr; + } - return slot_addr; + return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. - * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe) +static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { - struct xol_area *area; - unsigned long xol_vaddr; + struct xol_area *area = get_xol_area(); + unsigned long slot_nr; - area = get_xol_area(); if (!area) - return 0; + return false; - xol_vaddr = xol_take_insn_slot(area); - if (unlikely(!xol_vaddr)) - return 0; + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); - arch_uprobe_copy_ixol(area->page, xol_vaddr, + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - - return xol_vaddr; + return true; } /* - * xol_free_insn_slot - If slot was earlier allocated by - * @xol_get_insn_slot(), make the slot available for - * subsequent requests. 
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ -static void xol_free_insn_slot(struct task_struct *tsk) +static void xol_free_insn_slot(struct uprobe_task *utask) { - struct xol_area *area; - unsigned long vma_end; - unsigned long slot_addr; - - if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) - return; + struct xol_area *area = current->mm->uprobes_state.xol_area; + unsigned long offset = utask->xol_vaddr - area->vaddr; + unsigned int slot_nr; - slot_addr = tsk->utask->xol_vaddr; - if (unlikely(!slot_addr)) + utask->xol_vaddr = 0; + /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ + if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; - area = tsk->mm->uprobes_state.xol_area; - vma_end = area->vaddr + PAGE_SIZE; - if (area->vaddr <= slot_addr && slot_addr < vma_end) { - unsigned long offset; - int slot_nr; - - offset = slot_addr - area->vaddr; - slot_nr = offset / UPROBE_XOL_SLOT_BYTES; - if (slot_nr >= UINSNS_PER_PAGE) - return; - - clear_bit(slot_nr, area->bitmap); - atomic_dec(&area->slot_count); - smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ - if (waitqueue_active(&area->wq)) - wake_up(&area->wq); - - tsk->utask->xol_vaddr = 0; - } + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + clear_bit(slot_nr, area->bitmap); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, @@ -1750,11 +1889,18 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri) +static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) { struct return_instance *next = ri->next; - put_uprobe(ri->uprobe); - kfree(ri); + + if (cleanup_hprobe) { + enum hprobe_state hstate; + + (void)hprobe_consume(&ri->hprobe, &hstate); + 
hprobe_finalize(&ri->hprobe, hstate); + } + + kfree_rcu(ri, rcu); return next; } @@ -1770,18 +1916,50 @@ void uprobe_free_utask(struct task_struct *t) if (!utask) return; - if (utask->active_uprobe) - put_uprobe(utask->active_uprobe); + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); + + timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); - xol_free_insn_slot(t); kfree(utask); t->utask = NULL; } +#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + +#define for_each_ret_instance_rcu(pos, head) \ + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + +static void ri_timer(struct timer_list *timer) +{ + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + struct return_instance *ri; + + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + guard(srcu)(&uretprobes_srcu); + /* RCU protects return_instance from freeing. */ + guard(rcu)(); + + for_each_ret_instance_rcu(ri, utask->return_instances) + hprobe_expire(&ri->hprobe, false); +} + +static struct uprobe_task *alloc_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof(*utask), GFP_KERNEL); + if (!utask) + return NULL; + + timer_setup(&utask->ri_timer, ri_timer, 0); + + return utask; +} + /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. 
@@ -1793,38 +1971,73 @@ void uprobe_free_utask(struct task_struct *t) static struct uprobe_task *get_utask(void) { if (!current->utask) - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + current->utask = alloc_utask(); return current->utask; } +static size_t ri_size(int consumers_cnt) +{ + struct return_instance *ri; + + return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; +} + +#define DEF_CNT 4 + +static struct return_instance *alloc_return_instance(void) +{ + struct return_instance *ri; + + ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + if (!ri) + return ZERO_SIZE_PTR; + + ri->consumers_cnt = DEF_CNT; + return ri; +} + +static struct return_instance *dup_return_instance(struct return_instance *old) +{ + size_t size = ri_size(old->consumers_cnt); + + return kmemdup(old, size, GFP_KERNEL); +} + static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; + struct uprobe *uprobe; - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ + guard(srcu)(&uretprobes_srcu); + p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + n = dup_return_instance(o); if (!n) return -ENOMEM; - *n = *o; + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ + uprobe = hprobe_expire(&o->hprobe, true); + /* - * uprobe's refcnt has to be positive at this point, kept by - * utask->return_instances items; return_instances can't be - * removed right now, as task is blocked due to duping; so - * get_uprobe() is safe to use here. + * New utask will have stable properly refcounted uprobe or + * NULL. 
Even if we failed to get refcounted uprobe, we still + * need to preserve full set of return_instances for proper + * uretprobe handling and nesting in forked task. */ - get_uprobe(n->uprobe); - n->next = NULL; + hprobe_init_stable(&n->hprobe, uprobe); - *p = n; + n->next = NULL; + rcu_assign_pointer(*p, n); p = &n->next; + n_utask->depth++; } @@ -1900,45 +2113,34 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained, enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); utask->depth--; } - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); } -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, + struct return_instance *ri) { - struct return_instance *ri; - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; + int srcu_idx; if (!get_xol_area()) - return; - - utask = get_utask(); - if (!utask) - return; + goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); - return; + goto free; } - /* we need to bump refcount to store uprobe in utask */ - if (!try_get_uprobe(uprobe)) - return; - - ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!ri) - goto fail; - trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) - goto fail; + goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); @@ -1956,53 +2158,51 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * attack from 
user-space. */ uprobe_warn(current, "handle tail call"); - goto fail; + goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - ri->uprobe = uprobe; + + /* __srcu_read_lock() because SRCU lock survives switch to user space */ + srcu_idx = __srcu_read_lock(&uretprobes_srcu); + ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; + + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); + + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; -fail: +free: kfree(ri); - put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - struct uprobe_task *utask; - unsigned long xol_vaddr; + struct uprobe_task *utask = current->utask; int err; - utask = get_utask(); - if (!utask) - return -ENOMEM; - if (!try_get_uprobe(uprobe)) return -EINVAL; - xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) { + if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } - utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { - xol_free_insn_slot(current); + xol_free_insn_slot(utask); goto err_out; } @@ -2125,35 +2325,90 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } +static struct return_instance* +push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +{ + if (unlikely(ri == ZERO_SIZE_PTR)) + return ri; + + if (unlikely(idx >= ri->consumers_cnt)) { + struct return_instance *old_ri = ri; + + ri->consumers_cnt += DEF_CNT; + ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); + if (!ri) { + kfree(old_ri); + return ZERO_SIZE_PTR; + } + } + + 
ri->consumers[idx].id = id; + ri->consumers[idx].cookie = cookie; + return ri; +} + +static struct return_consumer * +return_consumer_find(struct return_instance *ri, int *iter, int id) +{ + struct return_consumer *ric; + int idx = *iter; + + for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + if (ric->id == id) { + *iter = idx + 1; + return ric; + } + } + return NULL; +} + +static bool ignore_ret_handler(int rc) +{ + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; +} + static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - int remove = UPROBE_HANDLER_REMOVE; - bool need_prep = false; /* prepare return uprobe, when needed */ - bool has_consumers = false; + bool has_consumers = false, remove = true; + struct return_instance *ri = NULL; + int push_idx = 0; current->utask->auprobe = &uprobe->arch; - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + __u64 cookie = 0; int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); - WARN(rc & ~UPROBE_HANDLER_MASK, + rc = uc->handler(uc, regs, &cookie); + WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } - if (uc->ret_handler) - need_prep = true; - - remove &= rc; + remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; + + if (!uc->ret_handler || ignore_ret_handler(rc)) + continue; + + if (!ri) + ri = alloc_return_instance(); + + if (session) + ri = push_consumer(ri, push_idx++, uc->id, cookie); } current->utask->auprobe = NULL; - if (need_prep && !remove) - prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (!ZERO_OR_NULL_PTR(ri)) { + /* + * The push_idx value has the final number of return consumers, + * and ri->consumers_cnt has number of allocated consumers. 
+ */ + ri->consumers_cnt = push_idx; + prepare_uretprobe(uprobe, regs, ri); + } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2169,19 +2424,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) } static void -handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { - struct uprobe *uprobe = ri->uprobe; + struct return_consumer *ric; struct uprobe_consumer *uc; - int srcu_idx; + int ric_idx = 0; + + /* all consumers unsubscribed meanwhile */ + if (unlikely(!uprobe)) + return; - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { - if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + rcu_read_lock_trace(); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + + if (uc->ret_handler) { + ric = return_consumer_find(ri, &ric_idx, uc->id); + if (!session || ric) + uc->ret_handler(uc, ri->func, regs, ric ? 
&ric->cookie : NULL); + } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2200,6 +2463,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + struct uprobe *uprobe; + enum hprobe_state hstate; bool valid; utask = current->utask; @@ -2230,21 +2495,24 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - utask->return_instances = ri->next; + rcu_assign_pointer(utask->return_instances, ri->next); + + uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); + handle_uretprobe_chain(ri, uprobe, regs); + hprobe_finalize(&ri->hprobe, hstate); + + /* We already took care of hprobe, no need to waste more time on that. */ + ri = free_ret_instance(ri, false /* !cleanup_hprobe */); utask->depth--; } while (ri != next); } while (!valid); - utask->return_instances = ri; return; - sigill: +sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); - } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -2266,13 +2534,13 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp, srcu_idx; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - srcu_idx = srcu_read_lock(&uprobes_srcu); + rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { @@ -2330,7 +2598,7 @@ static void handle_swbp(struct pt_regs *regs) out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } /* @@ -2353,7 +2621,7 @@ static void handle_singlestep(struct 
uprobe_task *utask, struct pt_regs *regs) put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - xol_free_insn_slot(current); + xol_free_insn_slot(utask); spin_lock_irq(&current->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..1dcddfe537ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> diff --git a/kernel/fork.c b/kernel/fork.c index cbdaca45d0c1..e58d27c05788 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,6 +105,7 @@ #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> +#include <linux/tick.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -653,11 +654,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - khugepaged_fork(mm, oldmm); - /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) @@ -760,6 +756,8 @@ loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); } else if (mpnt) { /* * The entire maple tree has already been duplicated. 
If the @@ -775,7 +773,10 @@ out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); - dup_userfaultfd_complete(&uf); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -999,7 +1000,7 @@ void __init __weak arch_task_cache_init(void) { } static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); + unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread @@ -1184,7 +1185,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->active_memcg = NULL; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif @@ -1298,7 +1299,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; - if (mm_alloc_cid(mm)) + if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, @@ -1756,33 +1757,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... 
*/ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -1864,6 +1862,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->ignored_posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -2295,6 +2294,7 @@ __latent_entropy struct task_struct *copy_process( acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); + tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); @@ -3238,17 +3238,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3306,7 +3305,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, 
&new_cred); @@ -3398,7 +3397,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); + error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) return error; diff --git a/kernel/freezer.c b/kernel/freezer.c index 44bbd7dbd2c8..8d530d0949ff 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); - if (p->on_rq) + /* + * Allow freezing the sched_delayed tasks; they will not execute until + * ttwu() fixes them up, so it is safe to swap their state now, instead + * of waiting for them to get fully dequeued. + */ + if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 136768ae2637..326bfe6549d7 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -140,9 +140,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, if (!time) return NULL; - hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(timeout, + (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). 
@@ -181,12 +181,12 @@ static u64 get_inode_sequence_number(struct inode *inode) return old; for (;;) { - u64 new = atomic64_add_return(1, &i_seq); + u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; - old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); - if (old) + old = 0; + if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 5722467f2737..d62cca5ed8f4 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -922,6 +922,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; + DEFINE_WAKE_Q(wake_q); int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) @@ -1018,8 +1019,11 @@ retry_private: * such that futex_unlock_pi() is guaranteed to observe the waiter when * it sees the futex_q::pi_state. */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + preempt_disable(); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + wake_up_q(&wake_q); + preempt_enable(); if (ret) { if (ret == 1) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index b3e98668f4dd..eb16a58e0322 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -141,9 +141,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + WARN_ON(devres_release(dev, devm_irq_release, devm_irq_match, &match_data)); - free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1dee88ba0ae4..0253e77fcd9a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include 
<linux/sysfs.h> +#include <linux/string_choices.h> #include "internals.h" @@ -138,8 +139,30 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc_smp_init(desc, node, affinity); } -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); +static unsigned int nr_irqs = NR_IRQS; + +/** + * irq_get_nr_irqs() - Number of interrupts supported by the system. + */ +unsigned int irq_get_nr_irqs(void) +{ + return nr_irqs; +} +EXPORT_SYMBOL_GPL(irq_get_nr_irqs); + +/** + * irq_set_nr_irqs() - Set the number of interrupts supported by the system. + * @nr: New number of interrupts. + * + * Return: @nr. + */ +unsigned int irq_set_nr_irqs(unsigned int nr) +{ + nr_irqs = nr; + + return nr; +} +EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, @@ -298,8 +321,7 @@ static ssize_t wakeup_show(struct kobject *kobj, ssize_t ret = 0; raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", - irqd_is_wakeup_set(&desc->irq_data) ? 
"enabled" : "disabled"); + ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); raw_spin_unlock_irq(&desc->lock); return ret; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0bff21f30e0..ec6d8e72d980 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1225,7 +1225,7 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, affinity); } else { - hint = hwirq % nr_irqs; + hint = hwirq % irq_get_nr_irqs(); if (hint == 0) hint++; virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3a24d6b5f559..396a067a8a56 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); if (ret < 0) { if (ops->msi_free) { - for (i--; i > 0; i--) + for (i--; i >= 0; i--) ops->msi_free(domain, info, virq + i); } irq_domain_free_irqs_top(domain, virq, nr_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9081ada81c3d..f36c33bd2da4 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -457,11 +457,12 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) } #ifndef ACTUAL_NR_IRQS -# define ACTUAL_NR_IRQS nr_irqs +# define ACTUAL_NR_IRQS irq_get_nr_irqs() #endif int show_interrupts(struct seq_file *p, void *v) { + const unsigned int nr_irqs = irq_get_nr_irqs(); static int prec; int i = *(loff_t *) v, j; @@ -494,9 +495,12 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) goto outsparse; - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0); + seq_printf(p, "%*d:", prec, i); + for_each_online_cpu(j) { + unsigned int cnt = desc->kstat_irqs ? 
per_cpu(desc->kstat_irqs->cnt, j) : 0; + + seq_put_decimal_ull_width(p, " ", cnt, 10); + } raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6dc76b590703..93a822d3c468 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -168,7 +168,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or - * static_key_slow_try_dec() observe the positive value, + * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); @@ -250,7 +250,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static bool static_key_slow_try_dec(struct static_key *key) +static bool static_key_dec_not_one(struct static_key *key) { int v; @@ -274,6 +274,14 @@ static bool static_key_slow_try_dec(struct static_key *key) * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); + + /* + * Warn about underflow, and lie about success in an attempt to + * not make things worse. + */ + if (WARN_ON_ONCE(v == 0)) + return true; + if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); @@ -284,15 +292,27 @@ static bool static_key_slow_try_dec(struct static_key *key) static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); + int val; - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) + val = atomic_read(&key->enabled); + /* + * It should be impossible to observe -1 with jump_label_mutex held, + * see static_key_slow_inc_cpuslocked(). + */ + if (WARN_ON_ONCE(val == -1)) + return; + /* + * Cannot already be 0, something went sideways. 
+ */ + if (WARN_ON_ONCE(val == 0)) + return; + + if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); - else - WARN_ON_ONCE(!static_key_slow_try_dec(key)); } static void __static_key_slow_dec(struct static_key *key) @@ -329,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key, { STATIC_KEY_CHECK_USE(key); - if (static_key_slow_try_dec(key)) + if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b0639f21041f..2c596851f8a9 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, idx); - rcu_read_unlock(); + file = fget_task(task, idx); if (file) fput(file); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 53b21ae30e00..2af39ba5b70b 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -46,14 +46,8 @@ static struct { int used; /* number of elements used */ bool sorted; /* if elements are sorted */ bool whitelist; /* if list is a blacklist or whitelist */ -} report_filterlist = { - .addrs = NULL, - .size = 8, /* small initial size */ - .used = 0, - .sorted = false, - .whitelist = false, /* default is blacklist */ -}; -static DEFINE_SPINLOCK(report_filterlist_lock); +} report_filterlist; +static DEFINE_RAW_SPINLOCK(report_filterlist_lock); /* * The microbenchmark allows benchmarking KCSAN core runtime only. 
To run @@ -110,7 +104,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) return false; func_addr -= offset; /* Get function start */ - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); if (report_filterlist.used == 0) goto out; @@ -127,7 +121,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr) ret = !ret; out: - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); return ret; } @@ -135,9 +129,9 @@ static void set_report_filterlist_whitelist(bool whitelist) { unsigned long flags; - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); report_filterlist.whitelist = whitelist; - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); } /* Returns 0 on success, error-code otherwise. */ @@ -145,6 +139,9 @@ static ssize_t insert_report_filterlist(const char *func) { unsigned long flags; unsigned long addr = kallsyms_lookup_name(func); + unsigned long *delay_free = NULL; + unsigned long *new_addrs = NULL; + size_t new_size = 0; ssize_t ret = 0; if (!addr) { @@ -152,42 +149,42 @@ static ssize_t insert_report_filterlist(const char *func) return -ENOENT; } - spin_lock_irqsave(&report_filterlist_lock, flags); +retry_alloc: + /* + * Check if we need an allocation, and re-validate under the lock. Since + * the report_filterlist_lock is a raw, cannot allocate under the lock. 
+ */ + if (data_race(report_filterlist.used == report_filterlist.size)) { + new_size = (report_filterlist.size ?: 4) * 2; + delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL); + if (!new_addrs) + return -ENOMEM; + } - if (report_filterlist.addrs == NULL) { - /* initial allocation */ - report_filterlist.addrs = - kmalloc_array(report_filterlist.size, - sizeof(unsigned long), GFP_ATOMIC); - if (report_filterlist.addrs == NULL) { - ret = -ENOMEM; - goto out; - } - } else if (report_filterlist.used == report_filterlist.size) { - /* resize filterlist */ - size_t new_size = report_filterlist.size * 2; - unsigned long *new_addrs = - krealloc(report_filterlist.addrs, - new_size * sizeof(unsigned long), GFP_ATOMIC); - - if (new_addrs == NULL) { - /* leave filterlist itself untouched */ - ret = -ENOMEM; - goto out; + raw_spin_lock_irqsave(&report_filterlist_lock, flags); + if (report_filterlist.used == report_filterlist.size) { + /* Check we pre-allocated enough, and retry if not. */ + if (report_filterlist.used >= new_size) { + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); + kfree(new_addrs); /* kfree(NULL) is safe */ + delay_free = new_addrs = NULL; + goto retry_alloc; } + if (report_filterlist.used) + memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long)); + delay_free = report_filterlist.addrs; /* free the old list */ + report_filterlist.addrs = new_addrs; /* switch to the new list */ report_filterlist.size = new_size; - report_filterlist.addrs = new_addrs; } /* Note: deduplicating should be done in userspace. 
*/ - report_filterlist.addrs[report_filterlist.used++] = - kallsyms_lookup_name(func); + report_filterlist.addrs[report_filterlist.used++] = addr; report_filterlist.sorted = false; -out: - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); + kfree(delay_free); return ret; } @@ -204,13 +201,13 @@ static int show_info(struct seq_file *file, void *v) } /* show filter functions, and filter type */ - spin_lock_irqsave(&report_filterlist_lock, flags); + raw_spin_lock_irqsave(&report_filterlist_lock, flags); seq_printf(file, "\n%s functions: %s\n", report_filterlist.whitelist ? "whitelisted" : "blacklisted", report_filterlist.used == 0 ? "none" : ""); for (i = 0; i < report_filterlist.used; ++i) seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]); - spin_unlock_irqrestore(&report_filterlist_lock, flags); + raw_spin_unlock_irqrestore(&report_filterlist_lock, flags); return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da59c68df841..b027a4030976 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -95,10 +95,6 @@ struct kprobe_insn_page { char slot_used[]; }; -#define KPROBE_INSN_PAGE_SIZE(slots) \ - (offsetof(struct kprobe_insn_page, slot_used) + \ - (sizeof(char) * (slots))) - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -175,7 +171,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) goto retry; /* All out of space. Need to allocate a new page. */ - kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); + kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL); if (!kip) goto out; @@ -206,29 +202,29 @@ static bool collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; - if (kip->nused == 0) { + if (kip->nused != 0) + return false; + + /* + * Page is no longer in use. Free it unless + * it's the last one. 
We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + if (!list_is_singular(&kip->list)) { /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. + * Record perf ksymbol unregister event before removing + * the page. */ - if (!list_is_singular(&kip->list)) { - /* - * Record perf ksymbol unregister event before removing - * the page. - */ - perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, - (unsigned long)kip->insns, PAGE_SIZE, true, - kip->cache->sym); - list_del_rcu(&kip->list); - synchronize_rcu(); - kip->cache->free(kip->insns); - kfree(kip); - } - return true; + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); + list_del_rcu(&kip->list); + synchronize_rcu(); + kip->cache->free(kip->insns); + kfree(kip); } - return false; + return true; } static int collect_garbage_slots(struct kprobe_insn_cache *c) @@ -353,8 +349,8 @@ struct kprobe_insn_cache kprobe_optinsn_slots = { /* .insn_size is initialized later */ .nr_garbage = 0, }; -#endif -#endif +#endif /* CONFIG_OPTPROBES */ +#endif /* __ARCH_WANT_KPROBES_INSN_SLOT */ /* We have preemption disabled.. 
so it is safe to use __ versions */ static inline void set_kprobe_instance(struct kprobe *kp) @@ -1543,7 +1539,7 @@ static int check_ftrace_location(struct kprobe *p) if (ftrace_location(addr) == addr) { #ifdef CONFIG_KPROBES_ON_FTRACE p->flags |= KPROBE_FLAG_FTRACE; -#else /* !CONFIG_KPROBES_ON_FTRACE */ +#else return -EINVAL; #endif } @@ -1725,28 +1721,29 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) if (unlikely(orig_p == NULL)) return ERR_PTR(-EINVAL); - if (!kprobe_disabled(p)) { - /* Disable probe if it is a child probe */ - if (p != orig_p) - p->flags |= KPROBE_FLAG_DISABLED; + if (kprobe_disabled(p)) + return orig_p; - /* Try to disarm and disable this/parent probe */ - if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - /* - * Don't be lazy here. Even if 'kprobes_all_disarmed' - * is false, 'orig_p' might not have been armed yet. - * Note arm_all_kprobes() __tries__ to arm all kprobes - * on the best effort basis. - */ - if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { - ret = disarm_kprobe(orig_p, true); - if (ret) { - p->flags &= ~KPROBE_FLAG_DISABLED; - return ERR_PTR(ret); - } + /* Disable probe if it is a child probe */ + if (p != orig_p) + p->flags |= KPROBE_FLAG_DISABLED; + + /* Try to disarm and disable this/parent probe */ + if (p == orig_p || aggr_kprobe_disabled(orig_p)) { + /* + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. 
+ */ + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); } - orig_p->flags |= KPROBE_FLAG_DISABLED; } + orig_p->flags |= KPROBE_FLAG_DISABLED; } return orig_p; diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..9bb36897b6c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); + if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) + return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7963deac33c3..2d8ec0351ef9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -788,7 +788,7 @@ static void lockdep_print_held_locks(struct task_struct *p) printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); else printk("%d lock%s held by %s/%d:\n", depth, - depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + str_plural(depth), p->comm, task_pid_nr(p)); /* * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. @@ -2084,6 +2084,9 @@ static noinline void print_bfs_bug(int ret) /* * Breadth-first-search failed, graph got corrupted? */ + if (ret == BFS_EQUEUEFULL) + pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n"); + WARN(1, "lockdep bfs error:%d\n", ret); } @@ -4583,6 +4586,30 @@ void lockdep_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } +/** + * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped + * + * @cpu: index of offlined CPU + * @idle: task pointer for offlined CPU's idle thread + * + * Invoked after the CPU is dead. Ensures that the tracing infrastructure + * is left in a suitable state for the CPU to be subsequently brought + * online again. 
+ */ +void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle) +{ + if (unlikely(!debug_locks)) + return; + + if (unlikely(per_cpu(hardirqs_enabled, cpu))) { + pr_warn("CPU %u left hardirqs enabled!", cpu); + if (idle) + print_irqtrace_events(idle); + /* Clean it up for when the CPU comes online again. */ + per_cpu(hardirqs_enabled, cpu) = 0; + } +} + static int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { @@ -6263,25 +6290,27 @@ static struct pending_free *get_pending_free(void) static void free_zapped_rcu(struct rcu_head *cb); /* - * Schedule an RCU callback if no RCU callback is pending. Must be called with - * the graph lock held. - */ -static void call_rcu_zapped(struct pending_free *pf) +* See if we need to queue an RCU callback, must called with +* the lockdep lock held, returns false if either we don't have +* any pending free or the callback is already scheduled. +* Otherwise, a call_rcu() must follow this function call. +*/ +static bool prepare_call_rcu_zapped(struct pending_free *pf) { WARN_ON_ONCE(inside_selftest()); if (list_empty(&pf->zapped)) - return; + return false; if (delayed_free.scheduled) - return; + return false; delayed_free.scheduled = true; WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); delayed_free.index ^= 1; - call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + return true; } /* The caller must hold the graph lock. May be called from RCU context. 
*/ @@ -6307,6 +6336,7 @@ static void free_zapped_rcu(struct rcu_head *ch) { struct pending_free *pf; unsigned long flags; + bool need_callback; if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) return; @@ -6318,14 +6348,18 @@ static void free_zapped_rcu(struct rcu_head *ch) pf = delayed_free.pf + (delayed_free.index ^ 1); __free_zapped_classes(pf); delayed_free.scheduled = false; + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); + lockdep_unlock(); + raw_local_irq_restore(flags); /* - * If there's anything on the open list, close and start a new callback. - */ - call_rcu_zapped(delayed_free.pf + delayed_free.index); + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. + */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - lockdep_unlock(); - raw_local_irq_restore(flags); } /* @@ -6365,6 +6399,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) { struct pending_free *pf; unsigned long flags; + bool need_callback; init_data_structures_once(); @@ -6372,10 +6407,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); lockdep_unlock(); raw_local_irq_restore(flags); - + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. 
@@ -6469,6 +6505,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) struct pending_free *pf; unsigned long flags; int locked; + bool need_callback = false; raw_local_irq_save(flags); locked = graph_lock(); @@ -6477,11 +6514,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) pf = get_pending_free(); __lockdep_reset_lock(pf, lock); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); graph_unlock(); out_irq: raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); } /* @@ -6525,6 +6564,7 @@ void lockdep_unregister_key(struct lock_class_key *key) struct pending_free *pf; unsigned long flags; bool found = false; + bool need_callback = false; might_sleep(); @@ -6545,11 +6585,14 @@ void lockdep_unregister_key(struct lock_class_key *key) if (found) { pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); } lockdep_unlock(); raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ synchronize_rcu(); } @@ -6557,17 +6600,17 @@ EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + pr_info("... 
MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + pr_info("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %zu kB\n", + pr_info(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + sizeof(lock_classes_in_use) + sizeof(classhash_table) + @@ -6585,12 +6628,12 @@ void __init lockdep_init(void) ); #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - printk(" memory used for stack traces: %zu kB\n", + pr_info(" memory used for stack traces: %zu kB\n", (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 ); #endif - printk(" per task-struct memory footprint: %zu bytes\n", + pr_info(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index e2bfb1db589d..6db0f43fc4df 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -424,7 +424,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) for (i = 0; i < offset; i++) seq_puts(m, " "); for (i = 0; i < length; i++) - seq_printf(m, "%c", c); + seq_putc(m, c); seq_puts(m, "\n"); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cbae8c0b89ab..3302e52f0c96 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -56,31 +56,6 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) } EXPORT_SYMBOL(__mutex_init); -/* - * @owner: contains: 'struct task_struct *' to the current lock owner, - * NULL means not owned. Since task_struct pointers are aligned at - * at least L1_CACHE_BYTES, we have low bits to store extra state. 
- * - * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. - * Bit1 indicates unlock needs to hand the lock to the top-waiter - * Bit2 indicates handoff has been done and we're waiting for pickup. - */ -#define MUTEX_FLAG_WAITERS 0x01 -#define MUTEX_FLAG_HANDOFF 0x02 -#define MUTEX_FLAG_PICKUP 0x04 - -#define MUTEX_FLAGS 0x07 - -/* - * Internal helper function; C doesn't allow us to hide it :/ - * - * DO NOT USE (outside of mutex code). - */ -static inline struct task_struct *__mutex_owner(struct mutex *lock) -{ - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); -} - static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); @@ -575,8 +550,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { + DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; struct ww_mutex *ww; + unsigned long flags; int ret; if (!use_ww_ctx) @@ -619,13 +596,13 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas return 0; } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* * After waiting to acquire the wait_lock, try again. */ if (__mutex_trylock(lock)) { if (ww_ctx) - __ww_mutex_check_waiters(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); goto skip_wait; } @@ -645,7 +622,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * Add in stamp order, waking up waiters that must kill * themselves. 
*/ - ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q); if (ret) goto err_early_kill; } @@ -680,7 +657,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + /* Make sure we do wakeups before calling schedule */ + wake_up_q(&wake_q); + wake_q_init(&wake_q); + schedule_preempt_disabled(); first = __mutex_waiter_is_first(lock, &waiter); @@ -701,9 +682,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas trace_contention_begin(lock, LCB_F_MUTEX); } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: __set_current_state(TASK_RUNNING); @@ -714,7 +695,7 @@ acquired: */ if (!ww_ctx->is_wait_die && !__mutex_waiter_is_first(lock, &waiter)) - __ww_mutex_check_waiters(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } __mutex_remove_waiter(lock, &waiter); @@ -729,7 +710,8 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); preempt_enable(); return 0; @@ -738,9 +720,10 @@ err: __mutex_remove_waiter(lock, &waiter); err_early_kill: trace_contention_end(lock, ret); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); + wake_up_q(&wake_q); preempt_enable(); return ret; } @@ -908,6 +891,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne struct task_struct *next = NULL; DEFINE_WAKE_Q(wake_q); unsigned long owner; + unsigned long flags; mutex_release(&lock->dep_map, ip); @@ -934,7 +918,7 @@ static noinline void __sched 
__mutex_unlock_slowpath(struct mutex *lock, unsigne } } - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -951,9 +935,10 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - raw_spin_unlock(&lock->wait_lock); - + preempt_disable(); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); wake_up_q(&wake_q); + preempt_enable(); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 0b2a79c4013b..cbff35b9b7ae 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -20,6 +20,33 @@ struct mutex_waiter { #endif }; +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * at least L1_CACHE_BYTES, we have low bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. + */ +#define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 + +#define MUTEX_FLAGS 0x07 + +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex & scheduler code). 
+ */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + if (!lock) + return NULL; + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); +} + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 75a6f6133866..b4233dc2c2b0 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -215,8 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg_release(&lock->tail, curr, - OSQ_UNLOCKED_VAL) == curr)) + if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL)) return; /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ac2e22502741..dc1cb90e3644 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -38,13 +38,13 @@ #define PV_PREV_CHECK_MASK 0xff /* - * Queue node uses: vcpu_running & vcpu_halted. - * Queue head uses: vcpu_running & vcpu_hashed. + * Queue node uses: VCPU_RUNNING & VCPU_HALTED. + * Queue head uses: VCPU_RUNNING & VCPU_HASHED. 
*/ enum vcpu_state { - vcpu_running = 0, - vcpu_halted, /* Used only in pv_wait_node */ - vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ + VCPU_RUNNING = 0, + VCPU_HALTED, /* Used only in pv_wait_node */ + VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */ }; struct pv_node { @@ -266,7 +266,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running; + return READ_ONCE(prev->state) != VCPU_RUNNING; } /* @@ -279,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node) BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode)); pn->cpu = smp_processor_id(); - pn->state = vcpu_running; + pn->state = VCPU_RUNNING; } /* @@ -308,26 +308,26 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) /* * Order pn->state vs pn->locked thusly: * - * [S] pn->state = vcpu_halted [S] next->locked = 1 + * [S] pn->state = VCPU_HALTED [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_hashed + * [L] pn->locked [RmW] pn->state = VCPU_HASHED * * Matches the cmpxchg() from pv_kick_node(). */ - smp_store_mb(pn->state, vcpu_halted); + smp_store_mb(pn->state, VCPU_HALTED); if (!READ_ONCE(node->locked)) { lockevent_inc(pv_wait_node); lockevent_cond_inc(pv_wait_early, wait_early); - pv_wait(&pn->state, vcpu_halted); + pv_wait(&pn->state, VCPU_HALTED); } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that + * If pv_kick_node() changed us to VCPU_HASHED, retain that * value so that pv_wait_head_or_lock() knows to not also try * to hash this lock. 
*/ - cmpxchg(&pn->state, vcpu_halted, vcpu_running); + cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING); /* * If the locked flag is still not set after wakeup, it is a @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - u8 old = vcpu_halted; + u8 old = VCPU_HALTED; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will @@ -374,7 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * subsequent writes. */ smp_mb__before_atomic(); - if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed)) + if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED)) return; /* @@ -407,7 +407,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * If pv_kick_node() already advanced our state, we don't need to * insert ourselves into the hash table anymore. */ - if (READ_ONCE(pn->state) == vcpu_hashed) + if (READ_ONCE(pn->state) == VCPU_HASHED) lp = (struct qspinlock **)1; /* @@ -420,7 +420,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * Set correct vCPU state to be used by queue node wait-early * mechanism. 
*/ - WRITE_ONCE(pn->state, vcpu_running); + WRITE_ONCE(pn->state, VCPU_RUNNING); /* * Set the pending bit in the active lock spinning loop to @@ -460,7 +460,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_hashed); + WRITE_ONCE(pn->state, VCPU_HASHED); lockevent_inc(pv_wait_head); lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ebebd0eec7f6..ac1365afcc4a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -34,13 +34,15 @@ static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { return 0; } static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { } @@ -1201,7 +1203,8 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, struct ww_acquire_ctx *ww_ctx, - enum rtmutex_chainwalk chwalk) + enum rtmutex_chainwalk chwalk, + struct wake_q_head *wake_q) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1245,7 +1248,10 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); - res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + preempt_disable(); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); + wake_up_q(wake_q); + preempt_enable(); if (res) { raw_spin_lock(&task->pi_lock); rt_mutex_dequeue(lock, waiter); @@ -1601,6 +1607,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, unsigned int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter 
*waiter) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct task_struct *owner; @@ -1674,12 +1681,14 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, * @state: The task state for sleeping * @chwalk: Indicator whether full or partial chainwalk is requested * @waiter: Initializer waiter for blocking + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); @@ -1690,7 +1699,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { if (build_ww_mutex() && ww_ctx) { - __ww_mutex_check_waiters(rtm, ww_ctx); + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } return 0; @@ -1700,7 +1709,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, trace_contention_begin(lock, LCB_F_RT); - ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); @@ -1708,7 +1717,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, /* acquired the lock */ if (build_ww_mutex() && ww_ctx) { if (!ww_ctx->is_wait_die) - __ww_mutex_check_waiters(rtm, ww_ctx); + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } } else { @@ -1730,7 +1739,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, static inline int 
__rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, - unsigned int state) + unsigned int state, + struct wake_q_head *wake_q) { struct rt_mutex_waiter waiter; int ret; @@ -1739,7 +1749,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, waiter.ww_ctx = ww_ctx; ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, - &waiter); + &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); return ret; @@ -1755,6 +1765,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state) { + DEFINE_WAKE_Q(wake_q); unsigned long flags; int ret; @@ -1776,8 +1787,11 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, * irqsave/restore variants. */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); + preempt_disable(); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); + preempt_enable(); rt_mutex_post_schedule(); return ret; @@ -1803,8 +1817,11 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, /** * rtlock_slowlock_locked - Slow path lock acquisition for RT locks * @lock: The underlying RT mutex + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ -static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, + struct wake_q_head *wake_q) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex_waiter waiter; struct task_struct *owner; @@ -1821,7 +1838,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) trace_contention_begin(lock, LCB_F_RT); - task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q); for (;;) { /* Try to 
acquire the lock again */ @@ -1832,7 +1849,11 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) schedule_rtlock(); @@ -1857,10 +1878,14 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&lock->wait_lock, flags); - rtlock_slowlock_locked(lock); + rtlock_slowlock_locked(lock, &wake_q); + preempt_disable(); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + wake_up_q(&wake_q); + preempt_enable(); } #endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index a6974d044593..33ea31d6a7b3 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -175,10 +175,10 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, } /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). + * mark_wakeup_next_waiter() deboosts and retains preemption + * disabled when dropping the wait_lock, to avoid inversion prior + * to the wakeup. preempt_disable() therein pairs with the + * preempt_enable() in rt_mutex_postunlock(). */ mark_wakeup_next_waiter(wqh, lock); @@ -275,6 +275,7 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) * @lock: the rt_mutex to take * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare + * @wake_q: the wake_q to wake tasks after we release the wait_lock * * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock * detection. 
It does not wait, see rt_mutex_wait_proxy_lock() for that. @@ -291,7 +292,8 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) */ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task) + struct task_struct *task, + struct wake_q_head *wake_q) { int ret; @@ -302,7 +304,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, - RT_MUTEX_FULL_CHAINWALK); + RT_MUTEX_FULL_CHAINWALK, wake_q); if (ret && !rt_mutex_owner(lock)) { /* @@ -341,12 +343,16 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct task_struct *task) { int ret; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); if (unlikely(ret)) remove_waiter(lock, waiter); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); return ret; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 1162e07cdaea..c38a2d2d4a7e 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -83,7 +83,8 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task); + struct task_struct *task, + struct wake_q_head *); extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 34a59569db6b..9f4322c07486 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -69,6 +69,7 @@ static int __sched __rwbase_read_lock(struct 
rwbase_rt *rwb, unsigned int state) { struct rt_mutex_base *rtm = &rwb->rtmutex; + DEFINE_WAKE_Q(wake_q); int ret; rwbase_pre_schedule(); @@ -110,7 +111,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, * For rwlocks this returns 0 unconditionally, so the below * !ret conditionals are optimized out. */ - ret = rwbase_rtmutex_slowlock_locked(rtm, state); + ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q); /* * On success the rtmutex is held, so there can't be a writer @@ -121,7 +122,12 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, */ if (!ret) atomic_inc(&rwb->readers); + + preempt_disable(); raw_spin_unlock_irq(&rtm->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); + if (!ret) rwbase_rtmutex_unlock(rtm); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5ded7dff46ef..2ddb827e3bea 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,12 +181,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } +#ifdef CONFIG_DEBUG_RWSEMS +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Return true if the rwsem is owned by a reader. */ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { -#ifdef CONFIG_DEBUG_RWSEMS /* * Check the count to see if it is write-locked. 
*/ @@ -194,11 +203,9 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) if (count & RWSEM_WRITER_MASK) return false; -#endif return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } -#ifdef CONFIG_DEBUG_RWSEMS /* * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there * is a task pointer in owner of a reader-owned rwsem, it will be the @@ -266,15 +273,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem) } /* - * Return just the real task structure pointer of the owner - */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) -{ - return (struct task_struct *) - (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); -} - -/* * Return the real task structure pointer of the owner and the embedded * flags in the owner. pflags must be non-NULL. */ @@ -1415,8 +1413,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_rtmutex_lock_state(rtm, state) \ __rt_mutex_lock(rtm, state) -#define rwbase_rtmutex_slowlock_locked(rtm, state) \ - __rt_mutex_slowlock_locked(rtm, NULL, state) +#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \ + __rt_mutex_slowlock_locked(rtm, NULL, state, wq) #define rwbase_rtmutex_unlock(rtm) \ __rt_mutex_unlock(rtm) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 438c6086d540..7685defd7c52 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); * towards that other CPU that it should break the lock ASAP. 
*/ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -77,7 +77,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ } \ } \ \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -95,12 +95,12 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ return flags; \ } \ \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ _raw_##op##_lock_irqsave(lock); \ } \ \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 38e292454fcc..db1e11b45de6 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -51,7 +51,7 @@ static __always_inline void __rt_spin_lock(spinlock_t *lock) migrate_disable(); } -void __sched rt_spin_lock(spinlock_t *lock) +void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU) { spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); __rt_spin_lock(lock); @@ -75,7 +75,7 @@ void __sched rt_spin_lock_nest_lock(spinlock_t *lock, EXPORT_SYMBOL(rt_spin_lock_nest_lock); #endif -void __sched rt_spin_unlock(spinlock_t *lock) +void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU) { spin_release(&lock->dep_map, _RET_IP_); migrate_enable(); @@ -162,9 +162,10 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) } static __always_inline int -rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state, + struct wake_q_head *wake_q) { - 
rtlock_slowlock_locked(rtm); + rtlock_slowlock_locked(rtm, wake_q); return 0; } @@ -225,7 +226,7 @@ int __sched rt_write_trylock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_trylock); -void __sched rt_read_lock(rwlock_t *rwlock) +void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); @@ -235,7 +236,7 @@ void __sched rt_read_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_read_lock); -void __sched rt_write_lock(rwlock_t *rwlock) +void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); @@ -246,7 +247,7 @@ void __sched rt_write_lock(rwlock_t *rwlock) EXPORT_SYMBOL(rt_write_lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU) { rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); @@ -257,7 +258,7 @@ void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) EXPORT_SYMBOL(rt_write_lock_nested); #endif -void __sched rt_read_unlock(rwlock_t *rwlock) +void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU) { rwlock_release(&rwlock->dep_map, _RET_IP_); migrate_enable(); @@ -266,7 +267,7 @@ void __sched rt_read_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_read_unlock); -void __sched rt_write_unlock(rwlock_t *rwlock) +void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU) { rwlock_release(&rwlock->dep_map, _RET_IP_); rcu_read_unlock(); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 10a5736a21c2..5d58b2c0ef98 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags) int ret; ww_mutex_init(&mtx.mutex, &ww_class); - ww_acquire_init(&ctx, &ww_class); + if (flags & TEST_MTX_CTX) + ww_acquire_init(&ctx, &ww_class); 
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); init_completion(&mtx.ready); @@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags) ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); } ww_mutex_unlock(&mtx.mutex); - ww_acquire_fini(&ctx); + if (flags & TEST_MTX_CTX) + ww_acquire_fini(&ctx); if (ret) { pr_err("%s(flags=%x): mutual exclusion failure\n", @@ -679,7 +681,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 76d204b7d29c..37f025a096c9 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -70,14 +70,14 @@ __ww_mutex_has_waiters(struct mutex *lock) return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; } -static inline void lock_wait_lock(struct mutex *lock) +static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) { - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, *flags); } -static inline void unlock_wait_lock(struct mutex *lock) +static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct mutex *lock) @@ -144,14 +144,14 @@ __ww_mutex_has_waiters(struct rt_mutex *lock) return rt_mutex_has_waiters(&lock->rtmutex); } -static inline void lock_wait_lock(struct rt_mutex *lock) +static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) { - raw_spin_lock(&lock->rtmutex.wait_lock); + raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } -static inline void unlock_wait_lock(struct rt_mutex *lock) +static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) { - raw_spin_unlock(&lock->rtmutex.wait_lock); + 
raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) @@ -275,7 +275,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) */ static bool __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) { if (!ww_ctx->is_wait_die) return false; @@ -284,7 +284,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } return true; @@ -299,7 +299,8 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, */ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, - struct ww_acquire_ctx *hold_ctx) + struct ww_acquire_ctx *hold_ctx, + struct wake_q_head *wake_q) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -331,7 +332,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * wakeup pending to re-read the wounded state. */ if (owner != current) - wake_up_process(owner); + wake_q_add(wake_q, owner); return true; } @@ -352,7 +353,8 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * The current task must not be on the wait list. 
*/ static void -__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { struct MUTEX_WAITER *cur; @@ -364,8 +366,8 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (__ww_mutex_die(lock, cur, ww_ctx) || - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q)) break; } } @@ -377,6 +379,9 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { + DEFINE_WAKE_Q(wake_q); + unsigned long flags; + ww_mutex_lock_acquired(lock, ctx); /* @@ -404,9 +409,12 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in fastpath, check if any of the waiters need to * die or wound us. */ - lock_wait_lock(&lock->base); - __ww_mutex_check_waiters(&lock->base, ctx); - unlock_wait_lock(&lock->base); + lock_wait_lock(&lock->base, &flags); + __ww_mutex_check_waiters(&lock->base, ctx, &wake_q); + preempt_disable(); + unlock_wait_lock(&lock->base, &flags); + wake_up_q(&wake_q); + preempt_enable(); } static __always_inline int @@ -488,7 +496,8 @@ __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, static inline int __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; @@ -532,7 +541,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, pos = cur; /* Wait-Die: ensure younger waiters die. 
*/ - __ww_mutex_die(lock, cur, ww_ctx); + __ww_mutex_die(lock, cur, ww_ctx, wake_q); } __ww_waiter_add(lock, waiter, pos); @@ -550,7 +559,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, * such that either we or the fastpath will wound @ww->ctx. */ smp_mb(); - __ww_mutex_wound(lock, ww_ctx, ww->ctx); + __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q); } return 0; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 4047b6d48255..7c6588148d42 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -160,6 +160,7 @@ config MODULE_UNLOAD_TAINT_TRACKING config MODVERSIONS bool "Module versioning support" + depends on !COMPILE_TEST help Usually, you have to use modules compiled with your kernel. Saying Y here makes it sometimes possible to use modules @@ -228,7 +229,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL choice - prompt "Which hash algorithm should modules be signed with?" + prompt "Hash algorithm to sign modules" depends on MODULE_SIG || IMA_APPRAISE_MODSIG help This determines which sort of hashing algorithm will be used during @@ -238,31 +239,31 @@ choice the signature on that module. 
config MODULE_SIG_SHA1 - bool "Sign modules with SHA-1" + bool "SHA-1" select CRYPTO_SHA1 config MODULE_SIG_SHA256 - bool "Sign modules with SHA-256" + bool "SHA-256" select CRYPTO_SHA256 config MODULE_SIG_SHA384 - bool "Sign modules with SHA-384" + bool "SHA-384" select CRYPTO_SHA512 config MODULE_SIG_SHA512 - bool "Sign modules with SHA-512" + bool "SHA-512" select CRYPTO_SHA512 config MODULE_SIG_SHA3_256 - bool "Sign modules with SHA3-256" + bool "SHA3-256" select CRYPTO_SHA3 config MODULE_SIG_SHA3_384 - bool "Sign modules with SHA3-384" + bool "SHA3-384" select CRYPTO_SHA3 config MODULE_SIG_SHA3_512 - bool "Sign modules with SHA3-512" + bool "SHA3-512" select CRYPTO_SHA3 endchoice @@ -278,64 +279,65 @@ config MODULE_SIG_HASH default "sha3-384" if MODULE_SIG_SHA3_384 default "sha3-512" if MODULE_SIG_SHA3_512 -choice - prompt "Module compression mode" +config MODULE_COMPRESS + bool "Module compression" help - This option allows you to choose the algorithm which will be used to - compress modules when 'make modules_install' is run. (or, you can - choose to not compress modules at all.) - - External modules will also be compressed in the same way during the - installation. - - For modules inside an initrd or initramfs, it's more efficient to - compress the whole initrd or initramfs instead. - + Enable module compression to reduce on-disk size of module binaries. This is fully compatible with signed modules. - Please note that the tool used to load modules needs to support the - corresponding algorithm. module-init-tools MAY support gzip, and kmod - MAY support gzip, xz and zstd. + The tool used to work with modules needs to support the selected + compression type. kmod MAY support gzip, xz and zstd. Other tools + might have a limited selection of the supported types. - Your build system needs to provide the appropriate compression tool - to compress the modules. 
+ Note that for modules inside an initrd or initramfs, it's more + efficient to compress the whole ramdisk instead. - If in doubt, select 'None'. + If unsure, say N. -config MODULE_COMPRESS_NONE - bool "None" +choice + prompt "Module compression type" + depends on MODULE_COMPRESS help - Do not compress modules. The installed modules are suffixed - with .ko. + Choose the supported algorithm for module compression. config MODULE_COMPRESS_GZIP bool "GZIP" help - Compress modules with GZIP. The installed modules are suffixed - with .ko.gz. + Support modules compressed with GZIP. The installed modules are + suffixed with .ko.gz. config MODULE_COMPRESS_XZ bool "XZ" help - Compress modules with XZ. The installed modules are suffixed - with .ko.xz. + Support modules compressed with XZ. The installed modules are + suffixed with .ko.xz. config MODULE_COMPRESS_ZSTD bool "ZSTD" help - Compress modules with ZSTD. The installed modules are suffixed - with .ko.zst. + Support modules compressed with ZSTD. The installed modules are + suffixed with .ko.zst. endchoice +config MODULE_COMPRESS_ALL + bool "Automatically compress all modules" + default y + depends on MODULE_COMPRESS + help + Compress all modules during 'make modules_install'. + + Your build system needs to provide the appropriate compression tool + for the selected compression type. External modules will also be + compressed in the same way during the installation. + config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD + depends on MODULE_COMPRESS select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD help - Support for decompressing kernel modules by the kernel itself instead of relying on userspace to perform this task. Useful when load pinning security policy is enabled. 
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c index 12a569d361e8..b4cc03842d70 100644 --- a/kernel/module/debug_kmemleak.c +++ b/kernel/module/debug_kmemleak.c @@ -12,19 +12,9 @@ void kmemleak_load_module(const struct module *mod, const struct load_info *info) { - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); + /* only scan writable, non-executable sections */ + for_each_mod_mem_type(type) { + if (type != MOD_DATA && type != MOD_INIT_DATA) + kmemleak_no_scan(mod->mem[type].base); } } diff --git a/kernel/module/dups.c b/kernel/module/dups.c index 9a92f2f8c9d3..bd2149fbe117 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -18,7 +18,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c index 0800d9891692..25f253812512 100644 --- a/kernel/module/kmod.c +++ b/kernel/module/kmod.c @@ -15,7 +15,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> diff --git a/kernel/module/main.c b/kernel/module/main.c index 49b9bca9de12..4490924fe24e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3202,7 +3202,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int { struct idempotent idem; - if (!f || !(f->f_mode & FMODE_READ)) + if 
(!(f->f_mode & FMODE_READ)) return -EBADF; /* Are we the winners of the race and get to do this? */ @@ -3219,10 +3219,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; - struct fd f; - - err = may_init_module(); + int err = may_init_module(); if (err) return err; @@ -3233,10 +3230,10 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - f = fdget(fd); - err = idempotent_init_module(fd_file(f), uargs, flags); - fdput(f); - return err; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + return idempotent_init_module(fd_file(f), uargs, flags); } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 26efe1305c12..456358e1fdc4 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, const struct load_info *info) +static int add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; struct bin_attribute **gattr; + int ret; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) @@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (!sect_attrs) - return; + return -ENOMEM; /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; @@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) sattr->address = sec->sh_addr; sattr->battr.attr.name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (!sattr->battr.attr.name) + if (!sattr->battr.attr.name) { + ret = -ENOMEM; goto out; + } sect_attrs->nsections++; sattr->battr.read = module_sect_read; sattr->battr.size = MODULE_SECT_READ_SIZE; @@ -113,13 +116,15 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) } *gattr = NULL; - if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp)) + ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp); + if (ret) goto out; mod->sect_attrs = sect_attrs; - return; + return 0; out: free_sect_attrs(sect_attrs); + return ret; } static void remove_sect_attrs(struct module *mod) @@ -158,15 +163,12 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, const struct load_info *info) +static int add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; + int ret; /* Count notes sections and allocate structures. 
*/ notes = 0; @@ -176,12 +178,12 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) ++notes; if (notes == 0) - return; + return 0; notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), GFP_KERNEL); if (!notes_attrs) - return; + return -ENOMEM; notes_attrs->notes = notes; nattr = &notes_attrs->attrs[0]; @@ -201,19 +203,23 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) } notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) + if (!notes_attrs->dir) { + ret = -ENOMEM; goto out; + } - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - &notes_attrs->attrs[i])) + for (i = 0; i < notes; ++i) { + ret = sysfs_create_bin_file(notes_attrs->dir, &notes_attrs->attrs[i]); + if (ret) goto out; + } mod->notes_attrs = notes_attrs; - return; + return 0; out: free_notes_attrs(notes_attrs, i); + return ret; } static void remove_notes_attrs(struct module *mod) @@ -223,9 +229,15 @@ static void remove_notes_attrs(struct module *mod) } #else /* !CONFIG_KALLSYMS */ -static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_sect_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline int add_notes_attrs(struct module *mod, const struct load_info *info) +{ + return 0; +} static inline void remove_notes_attrs(struct module *mod) { } #endif /* CONFIG_KALLSYMS */ @@ -385,11 +397,20 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_modinfo_attrs; - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); + err = add_sect_attrs(mod, info); + if (err) + goto out_del_usage_links; + + err = add_notes_attrs(mod, info); + if (err) + goto out_unreg_sect_attrs; return 0; +out_unreg_sect_attrs: + remove_sect_attrs(mod); 
+out_del_usage_links: + del_usage_links(mod); out_unreg_modinfo_attrs: module_remove_modinfo_attrs(mod, -1); out_unreg_param: diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index dc952c3b05af..c9d97ed20122 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,12 +545,12 @@ static void commit_nsset(struct nsset *nsset) SYSCALL_DEFINE2(setns, int, fd, int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct ns_common *ns = NULL; struct nsset nsset = {}; int err = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; if (proc_ns_file(fd_file(f))) { @@ -580,7 +580,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) } put_nsset(&nsset); out: - fdput(f); return err; } diff --git a/kernel/padata.c b/kernel/padata.c index d899f34558af..d51bbc76b227 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -521,13 +521,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, 1ul); ps.chunk_size = roundup(ps.chunk_size, job->align); - /* - * chunk_size can be 0 if the caller sets min_chunk to 0. 
So force it - * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` - */ - if (!ps.chunk_size) - ps.chunk_size = 1U; - list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); diff --git a/kernel/pid.c b/kernel/pid.c index 2715afb77eab..115448e89c3e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -536,11 +536,10 @@ EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { - struct fd f; + CLASS(fd, f)(fd); struct pid *pid; - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); pid = pidfd_pid(fd_file(f)); @@ -548,8 +547,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) get_pid(pid); *flags = fd_file(f)->f_flags; } - - fdput(f); return pid; } @@ -747,23 +744,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; - struct fd f; - int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; - f = fdget(pidfd); - if (!fd_file(f)) + CLASS(fd, f)(pidfd); + if (fd_empty(f)) return -EBADF; pid = pidfd_pid(fd_file(f)); if (IS_ERR(pid)) - ret = PTR_ERR(pid); - else - ret = pidfd_getfd(pid, fd); + return PTR_ERR(pid); - fdput(f); - return ret; + return pidfd_getfd(pid, fd); } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 927cc55ba0b3..d07faf42eace 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, goto unlock; dev->em_pd->flags |= flags; + dev->em_pd->min_perf_state = 0; + dev->em_pd->max_perf_state = nr_states - 1; em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); @@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev) return em_recalc_and_update(dev, pd, em_table); } EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); + + +/** + * em_update_performance_limits() - Update Energy Model 
with performance + * limits information. + * @pd : Performance Domain with EM that has to be updated. + * @freq_min_khz : New minimum allowed frequency for this device. + * @freq_max_khz : New maximum allowed frequency for this device. + * + * This function allows to update the EM with information about available + * performance levels. It takes the minimum and maximum frequency in kHz + * and does internal translation to performance levels. + * Returns 0 on success or -EINVAL when failed. + */ +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + struct em_perf_state *table; + int min_ps = -1; + int max_ps = -1; + int i; + + if (!pd) + return -EINVAL; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + if (freq_min_khz == table[i].frequency) + min_ps = i; + if (freq_max_khz == table[i].frequency) + max_ps = i; + } + rcu_read_unlock(); + + /* Only update when both are found and sane */ + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps) + return -EINVAL; + + + /* Guard simultaneous updates and make them atomic */ + mutex_lock(&em_pd_mutex); + pd->min_perf_state = min_ps; + pd->max_perf_state = max_ps; + mutex_unlock(&em_pd_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(em_update_performance_limits); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3aa41ba22129..3f9e3efb9f6e 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -447,7 +447,6 @@ static const struct file_operations snapshot_fops = { .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, - .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 3fcb48502adb..c6bb47666aef 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -53,6 +53,8 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table 
*table, int write, /* Flags for a single printk record. */ enum printk_info_flags { + /* always show on console, ignore console_loglevel */ + LOG_FORCE_CON = 1, LOG_NEWLINE = 2, /* text ended with a newline */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -90,6 +92,7 @@ bool printk_percpu_data_ready(void); void defer_console_output(void); bool is_printk_legacy_deferred(void); +bool is_printk_force_console(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index beb808f4c367..80910bc3470c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -560,10 +560,11 @@ bool printk_percpu_data_ready(void) /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { - raw_write_seqcount_latch(&ls->latch); + write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; - raw_write_seqcount_latch(&ls->latch); + write_seqcount_latch(&ls->latch); ls->val[1] = val; + write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. 
*/ @@ -574,10 +575,10 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls) u64 val; do { - seq = raw_read_seqcount_latch(&ls->latch); + seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; - } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); + } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } @@ -1156,6 +1157,17 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; +static void print_log_buf_usage_stats(void) +{ + unsigned int descs_count = log_buf_len >> PRB_AVGBITS; + size_t meta_data_size; + + meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); + + pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", + log_buf_len, meta_data_size, log_buf_len + meta_data_size); +} + void __init setup_log_buf(int early) { struct printk_info *new_infos; @@ -1185,20 +1197,25 @@ void __init setup_log_buf(int early) if (!early && !new_log_buf_len) log_buf_add_cpu(); - if (!new_log_buf_len) + if (!new_log_buf_len) { + /* Show the memory stats only once. 
*/ + if (!early) + goto out; + return; + } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); - return; + goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); - return; + goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); @@ -1261,7 +1278,7 @@ void __init setup_log_buf(int early) prb_next_seq(&printk_rb_static) - seq); } - pr_info("log_buf_len: %u bytes\n", log_buf_len); + print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; @@ -1270,6 +1287,8 @@ err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); +out: + print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; @@ -1319,11 +1338,11 @@ static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; + bool suppress = !is_printk_force_console() && + suppress_message_printing(level); - if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) - || suppress_message_printing(level)) { + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; - } k = (unsigned long long)loops_per_msec * boot_delay; @@ -2273,6 +2292,9 @@ int vprintk_store(int facility, int level, if (dev_info) flags |= LOG_NEWLINE; + if (is_printk_force_console()) + flags |= LOG_FORCE_CON; + if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { @@ -2280,6 +2302,9 @@ int vprintk_store(int facility, int level, facility, &flags, fmt, args); r.info->text_len += text_len; + if (flags & LOG_FORCE_CON) + r.info->flags |= LOG_FORCE_CON; + if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); @@ -2947,6 +2972,7 @@ bool printk_get_next_message(struct 
printk_message *pmsg, u64 seq, struct printk_info info; struct printk_record r; size_t len = 0; + bool force_con; /* * Formatting extended messages requires a separate buffer, so use the @@ -2965,9 +2991,13 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; + force_con = r.info->flags & LOG_FORCE_CON; - /* Skip record that has level above the console loglevel. */ - if (may_suppress && suppress_message_printing(r.info->level)) + /* + * Skip records that are not forced to be printed on consoles and that + * has level above the console loglevel. + */ + if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 2b35a9d3919d..6f94418d53ff 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -12,6 +12,24 @@ #include "internal.h" +/* Context where printk messages are never suppressed */ +static atomic_t force_con; + +void printk_force_console_enter(void) +{ + atomic_inc(&force_con); +} + +void printk_force_console_exit(void) +{ + atomic_dec(&force_con); +} + +bool is_printk_force_console(void) +{ + return atomic_read(&force_con); +} + static DEFINE_PER_CPU(int, printk_context); /* Can be preempted by NMI. */ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3e079de0f5b4..b9b6bc55185d 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -249,16 +249,24 @@ config RCU_NOCB_CPU workloads will incur significant increases in context-switch rates. - This option offloads callback invocation from the set of CPUs - specified at boot time by the rcu_nocbs parameter. For each - such CPU, a kthread ("rcuox/N") will be created to invoke - callbacks, where the "N" is the CPU being offloaded, and where - the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for - RCU-sched (!PREEMPTION kernels). 
Nothing prevents this kthread - from running on the specified CPUs, but (1) the kthreads may be - preempted between each callback, and (2) affinity or cgroups can - be used to force the kthreads to run on whatever set of CPUs is - desired. + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "p" for RCU-preempt (PREEMPTION kernels) + and "s" for RCU-sched (!PREEMPTION kernels). This option + also creates another kthread for each sqrt(nr_cpu_ids) CPUs + ("rcuog/N", where N is the first CPU in that group to come + online), which handles grace periods for its group. Nothing + prevents these kthreads from running on the specified CPUs, + but (1) the kthreads may be preempted between each callback, + and (2) affinity or cgroups can be used to force the kthreads + to run on whatever set of CPUs is desired. + + The sqrt(nr_cpu_ids) grouping may be overridden using the + rcutree.rcu_nocb_gp_stride kernel boot parameter. This can + be especially helpful for smaller numbers of CPUs, where + sqrt(nr_cpu_ids) can be a bit of a blunt instrument. Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. 
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 259904075636..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -120,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 6d37596deb1f..0f3059b1b80d 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -889,14 +889,14 @@ kfree_scale_init(void) if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bb75dbf5c800..612d27690335 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]> and Josh Triplett <josh@ /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. 
*/ +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]> and Josh Triplett <josh@ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) +#define RCUTORTURE_RDR_ALLBITS \ + (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ + RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. */ #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) @@ -108,6 +111,7 @@ torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disab torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -393,6 +397,7 @@ struct rcu_torture_ops { int slow_gps; int no_pi_lock; int debug_objects; + int start_poll_irqsoff; const char *name; }; @@ -581,6 +586,7 @@ static struct rcu_torture_ops rcu_ops = { .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, .debug_objects = 1, + .start_poll_irqsoff = 1, .name = "rcu" }; @@ -641,10 +647,25 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { - if 
(cur_ops == &srcud_ops) - return srcu_read_lock_nmisafe(srcu_ctlp); - else - return srcu_read_lock(srcu_ctlp); + int idx; + int ret = 0; + + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + if (reader_flavor & 0x2) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + if (reader_flavor & 0x4) { + idx = srcu_read_lock_lite(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + return ret; } static void @@ -668,10 +689,13 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { - if (cur_ops == &srcud_ops) - srcu_read_unlock_nmisafe(srcu_ctlp, idx); - else - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x4) + srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); + if (reader_flavor & 0x2) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -1059,8 +1083,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star // At most one persisted message per boost test. 
j = jiffies; lp = READ_ONCE(last_persist); - if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) - pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } return false; // passed on a technicality } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); @@ -1695,14 +1724,22 @@ rcu_torture_fakewriter(void *arg) cur_ops->cond_sync_exp_full(&gp_snap_full); break; case RTWS_POLL_GET: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); gp_snap = cur_ops->start_gp_poll(); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state(gp_snap)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } break; case RTWS_POLL_GET_FULL: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); cur_ops->start_gp_poll_full(&gp_snap_full); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); @@ -1820,7 +1857,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesold = *readstate & ~newstate; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. 
*/ @@ -1835,9 +1872,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; /* * Next, remove old protection, in decreasing order of strength @@ -1857,7 +1894,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1867,7 +1904,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(&current->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) @@ -1882,16 +1919,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected 
idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1916,7 +1950,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; @@ -2389,6 +2423,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " + "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -2401,6 +2436,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, + reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis); } @@ -2440,6 +2476,14 @@ static int rcutorture_booster_init(unsigned int cpu) WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) { + t = per_cpu(ktimerd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } +#endif } /* Don't allow time recalculation while creating a new task. 
*/ diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 0db9db73f57f..aacfcc9838b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -75,6 +75,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); @@ -212,6 +215,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_lite_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_lite_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_lite_ref_scale_read_section, + .delaysection = srcu_lite_ref_scale_delay_section, + .name = "srcu-lite" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -801,6 +834,18 @@ static void rcu_scale_one_reader(void) cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. 
+static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -829,7 +874,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(raw_smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) @@ -957,6 +1002,7 @@ static int main_func(void *arg) schedule_timeout_uninterruptible(1); // Start exp readers up per experiment + rcu_scale_warm_cool(); for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (torture_must_stop()) goto end; @@ -987,6 +1033,7 @@ static int main_func(void *arg) result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } + rcu_scale_warm_cool(); // Print the average of all experiments SCALEOUT("END OF TEST. 
Calculating average duration per loop (nanoseconds)...\n"); @@ -1082,9 +1129,10 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, - &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, + &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 549c03336ee9..4dcbf8aa80ff 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -122,8 +122,8 @@ void srcu_drive_gp(struct work_struct *wp) ssp = container_of(wp, struct srcu_struct, srcu_work); preempt_disable(); // Needed for PREEMPT_AUTO if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { - return; /* Already running or nothing to do. */ preempt_enable(); + return; /* Already running or nothing to do. */ } /* Remove recently arrived callbacks and wait for readers. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 31706e3293bc..5e2e53464794 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -128,7 +128,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); @@ -187,7 +187,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. 
*/ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; @@ -419,41 +419,60 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Is the current or any upcoming grace period to be expedited? */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + +/* + * Computes approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx, and returns true if + * the caller did the proper barrier (gp), and if the count of the locks + * matches that of the unlocks passed in. 
+ */ +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - return sum; + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_LITE && !gp) + return false; + return sum == unlocks; } /* * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); - if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), - "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -463,22 +482,28 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp; + unsigned long rdm; unsigned 
long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); /* * Make sure that a lock is always counted if the corresponding * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have @@ -536,7 +561,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * which are unlikely to be configured with an address space fully * populated with memory, at least not anytime soon. 
*/ - return srcu_readers_lock_idx(ssp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks); } /** @@ -554,12 +579,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); - sum += atomic_long_read(&cpuc->srcu_lock_count[1]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_lock_count[0]); + sum += atomic_long_read(&sdp->srcu_lock_count[1]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); } return sum; } @@ -622,7 +647,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; @@ -687,28 +712,28 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -#ifdef CONFIG_PROVE_RCU /* - * Check for consistent NMI safety. + * Check for consistent reader flavor. */ -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int nmi_safe_mask = 1 << nmi_safe; - int old_nmi_safe_mask; + int old_read_flavor; struct srcu_data *sdp; - /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!nmi_safe && in_nmi()); + /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. 
*/ + WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor & (read_flavor - 1)); + sdp = raw_cpu_ptr(ssp->sda); - old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); - if (!old_nmi_safe_mask) { - WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); - return; + old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + if (!old_read_flavor) { + old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); + if (!old_read_flavor) + return; } - WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); + WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } -EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); -#endif /* CONFIG_PROVE_RCU */ +EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the @@ -867,7 +892,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); @@ -1122,6 +1147,8 @@ static void srcu_flip(struct srcu_struct *ssp) * it stays until either (1) Compilers learn about this sort of * control dependency or (2) Some production workload running on * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ @@ -1139,7 +1166,9 @@ static void srcu_flip(struct srcu_struct *ssp) } /* - * If SRCU is likely idle, return true, otherwise return false. 
+ * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1159,7 +1188,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1168,6 +1197,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long tlast; check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); @@ -1469,14 +1501,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. 
*/ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6333f4ccf024..59314da5eb60 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t) return false; /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + + /* * Idle tasks (or idle injection) within the idle loop are RCU-tasks * quiescent states. But CPU boot code performed by the idle task * isn't a quiescent state. @@ -1398,7 +1407,8 @@ static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) */ void synchronize_rcu_tasks_rude(void) { - synchronize_rcu_tasks_generic(&rcu_tasks_rude); + if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU)) + synchronize_rcu_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); @@ -1540,22 +1550,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v) */ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) { - union rcu_special ret; - union rcu_special trs_old = READ_ONCE(t->trc_reader_special); - union rcu_special trs_new = trs_old; - - if (trs_old.b.need_qs != old) - return trs_old.b.need_qs; - trs_new.b.need_qs = new; - - // Although cmpxchg() appears to KCSAN to update all four bytes, - // only the .b.need_qs byte actually changes. - instrument_atomic_read_write(&t->trc_reader_special.b.need_qs, - sizeof(t->trc_reader_special.b.need_qs)); - // Avoid false-positive KCSAN failures. 
- ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s)); - - return ret.b.need_qs; + return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a60616e69b66..ff98233d4aa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3511,7 +3511,7 @@ static int krc_count(struct kfree_rcu_cpu *krcp) } static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { long delay, delay_left; @@ -3526,6 +3526,16 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) } static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static void kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) { struct list_head bulk_ready[FREE_N_CHANNELS]; @@ -3607,11 +3617,12 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) } // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; } } @@ -3835,7 +3846,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_monitor_work(krcp); + __schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -4193,7 +4204,6 @@ static void start_poll_synchronize_rcu_common(void) struct rcu_data *rdp; struct rcu_node *rnp; - lockdep_assert_irqs_enabled(); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; @@ -4218,9 +4228,6 @@ static void start_poll_synchronize_rcu_common(void) * grace period has elapsed in the meantime. If the needed grace period * is not already slated to start, notifies RCU core of the need for that * grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ unsigned long start_poll_synchronize_rcu(void) { @@ -4241,9 +4248,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * grace period (whether normal or expedited) has elapsed in the meantime. * If the needed grace period is not already slated to start, notifies * RCU core of the need for that grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { @@ -5579,8 +5583,7 @@ void rcu_init_geometry(void) * Complain and fall back to the compile-time values if this * limit is exceeded. 
*/ - if (rcu_fanout_leaf < 2 || - rcu_fanout_leaf > sizeof(unsigned long) * 8) { + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 97b99cd06923..2605dd234a13 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -554,13 +554,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags)) { + } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + /* + * Don't do the wake-up upfront on fragile paths. + * Also offline CPUs can't call swake_up_one_online() from + * (soft-)IRQs. Rely on the final deferred wake-up from + * rcutree_report_cpu_dead() + */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); @@ -885,7 +891,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); if (kthread_should_park()) { - kthread_parkme(); + /* + * kthread_park() must be preceded by an rcu_barrier(). + * But yet another rcu_barrier() might have sneaked in between + * the barrier callback execution and the callbacks counter + * decrement. 
+ */ + if (rdp->nocb_cb_sleep) { + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_parkme(); + } } else if (READ_ONCE(rdp->nocb_cb_sleep)) { WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c7cbd145d5e..3927ea5f7955 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) switch (blkd_state) { case 0: case RCU_EXP_TASKS: - case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD: case RCU_GP_TASKS: - case RCU_GP_TASKS + RCU_EXP_TASKS: + case RCU_GP_TASKS | RCU_EXP_TASKS: /* * Blocking neither GP, or first task blocking the normal @@ -198,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) case RCU_EXP_BLKD: case RCU_GP_BLKD: - case RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: /* * First task arriving that blocks either GP, or first task @@ -214,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); break; - case RCU_EXP_TASKS + RCU_EXP_BLKD: - case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD: /* * Second or subsequent task blocking the expedited GP. 
@@ -227,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add(&t->rcu_node_entry, rnp->exp_tasks); break; - case RCU_GP_TASKS + RCU_GP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD: /* * Second or subsequent task blocking the normal GP. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4432db6d0b99..925fcdad5dea 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -76,36 +76,6 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); -/** - * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? - * - * Returns @true if the current grace period is sufficiently old that - * it is reasonable to assume that it might be stalled. This can be - * useful when deciding whether to allocate memory to enable RCU-mediated - * freeing on the one hand or just invoking synchronize_rcu() on the other. - * The latter is preferable when the grace period is stalled. - * - * Note that sampling of the .gp_start and .gp_seq fields must be done - * carefully to avoid false positives at the beginnings and ends of - * grace periods. - */ -bool rcu_gp_might_be_stalled(void) -{ - unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; - unsigned long j = jiffies; - - if (d < RCU_STALL_MIGHT_MIN) - d = RCU_STALL_MIGHT_MIN; - smp_mb(); // jiffies before .gp_seq to avoid false positives. - if (!rcu_gp_in_progress()) - return false; - // Long delays at this point avoids false positive, but a delay - // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. - smp_mb(); // .gp_seq before second .gp_start - // And ditto here. - return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); -} - /* Don't do RCU CPU stall warnings during long sysrq printouts. 
*/ void rcu_sysrq_start(void) { @@ -365,7 +335,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(void) +static void rcu_dump_cpu_stacks(unsigned long gp_seq) { int cpu; unsigned long flags; @@ -373,15 +343,23 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { printk_deferred_enter(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); else dump_cpu_task(cpu); } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } printk_deferred_exit(); } } @@ -638,7 +616,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); /* Complain about tasks blocking the grace period. */ rcu_for_each_leaf_node(rnp) @@ -670,7 +648,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_force_quiescent_state(); /* Kick them all. 
*/ } -static void print_cpu_stall(unsigned long gps) +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -707,7 +685,7 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ @@ -789,7 +767,8 @@ static void check_cpu_stall(struct rcu_data *rdp) gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) + ULONG_CMP_GE(gps, js) || + !rcu_seq_state(gs2)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; @@ -810,7 +789,7 @@ static void check_cpu_stall(struct rcu_data *rdp) pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); } else if (self_detected) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + print_cpu_stall(gs2, gps); } else { /* They had a few time units to dump stack, so complain. 
*/ print_other_cpu_stall(gs2, gps); diff --git a/kernel/relay.c b/kernel/relay.c index a8e90e98bf2c..a8ae436dc77e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1079,7 +1079,6 @@ const struct file_operations relay_file_operations = { .poll = relay_file_poll, .mmap = relay_file_mmap, .read = relay_file_read, - .llseek = no_llseek, .release = relay_file_release, }; EXPORT_SYMBOL_GPL(relay_file_operations); diff --git a/kernel/resource.c b/kernel/resource.c index b730bd28b422..4101016e8b20 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, rams_size += 16; } - rams[i].start = res.start; - rams[i++].end = res.end; - + rams[i++] = res; start = res.end + 1; } diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 42d2d8d20f5d..b8ef75b99eb2 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -169,6 +169,8 @@ static void resource_test_intersection(struct kunit *test) #define RES_TEST_RAM3_SIZE SZ_1M #define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); + static void remove_free_resource(void *ctx) { struct resource *res = (struct resource *)ctx; @@ -177,6 +179,14 @@ static void remove_free_resource(void *ctx) kfree(res); } +static void resource_test_add_action_or_abort( + struct kunit *test, void (*action)(void *), void *ctx) +{ + KUNIT_ASSERT_EQ_MSG(test, 0, + kunit_add_action_or_reset(test, action, ctx), + "Fail to add action"); +} + static void resource_test_request_region(struct kunit *test, struct resource *parent, resource_size_t start, resource_size_t size, const char *name, unsigned long flags) @@ -185,7 +195,7 @@ static void resource_test_request_region(struct kunit *test, struct resource *pa res = __request_region(parent, start, size, name, flags); KUNIT_ASSERT_NOT_NULL(test, res); - kunit_add_action_or_reset(test, remove_free_resource, res); + 
resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_insert_resource(struct kunit *test, struct resource *parent, @@ -202,11 +212,11 @@ static void resource_test_insert_resource(struct kunit *test, struct resource *p res->end = start + size - 1; res->flags = flags; if (insert_resource(parent, res)) { - kfree(res); + resource_test_add_action_or_abort(test, kfree_wrapper, res); KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); } - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_region_intersects(struct kunit *test) @@ -220,7 +230,7 @@ static void resource_test_region_intersects(struct kunit *test) "test resources"); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); start = parent->start; - kunit_add_action_or_reset(test, remove_free_resource, parent); + resource_test_add_action_or_abort(test, remove_free_resource, parent); resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 44e83a646264..d86d2d9c4624 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -97,6 +97,7 @@ struct scf_statistics { static struct scf_statistics *scf_stats_p; static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); +static DEFINE_PER_CPU(struct llist_head, scf_free_pool); // Data for random primitive selection #define SCF_PRIM_RESCHED 0 @@ -133,6 +134,7 @@ struct scf_check { bool scfc_wait; bool scfc_rpc; struct completion scfc_completion; + struct llist_node scf_node; }; // Use to wait for all threads to start. @@ -148,6 +150,33 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); extern void resched_cpu(int cpu); // An alternative IPI vector. 
+static void scf_add_to_free_list(struct scf_check *scfcp) +{ + struct llist_head *pool; + unsigned int cpu; + + if (!scfcp) + return; + cpu = raw_smp_processor_id() % nthreads; + pool = &per_cpu(scf_free_pool, cpu); + llist_add(&scfcp->scf_node, pool); +} + +static void scf_cleanup_free_list(unsigned int cpu) +{ + struct llist_head *pool; + struct llist_node *node; + struct scf_check *scfcp; + + pool = &per_cpu(scf_free_pool, cpu); + node = llist_del_all(pool); + while (node) { + scfcp = llist_entry(node, struct scf_check, scf_node); + node = node->next; + kfree(scfcp); + } +} + // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) { @@ -296,7 +325,7 @@ out: if (scfcp->scfc_rpc) complete(&scfcp->scfc_completion); } else { - kfree(scfcp); + scf_add_to_free_list(scfcp); } } @@ -320,10 +349,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); - if (use_cpus_read_lock) - cpus_read_lock(); - else - preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); if (!scfcp) { @@ -337,6 +362,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_rpc = false; } } + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); switch (scfsp->scfs_prim) { case SCF_PRIM_RESCHED: if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) { @@ -363,7 +392,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single_wait_ofl++; else scfp->n_single_ofl++; - kfree(scfcp); + scf_add_to_free_list(scfcp); scfcp = NULL; } break; @@ -391,7 +420,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); } else { scfp->n_single_rpc_ofl++; - kfree(scfcp); + scf_add_to_free_list(scfcp); scfcp = NULL; } break; @@ -428,7 +457,7 @@ static void 
scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim); atomic_inc(&n_mb_out_errs); // Leak rather than trash! } else { - kfree(scfcp); + scf_add_to_free_list(scfcp); } barrier(); // Prevent race-reduction compiler optimizations. } @@ -463,7 +492,7 @@ static int scftorture_invoker(void *arg) // Make sure that the CPU is affinitized appropriately during testing. curcpu = raw_smp_processor_id(); - WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, + WARN_ONCE(curcpu != cpu, "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", __func__, scfp->cpu, curcpu, nr_cpu_ids); @@ -479,6 +508,8 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); do { + scf_cleanup_free_list(cpu); + scftorture_invoke_one(scfp, &rand); while (cpu_is_offline(cpu) && !torture_must_stop()) { schedule_timeout_interruptible(HZ / 5); @@ -523,12 +554,15 @@ static void scf_torture_cleanup(void) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else goto end; - smp_call_function(scf_cleanup_handler, NULL, 0); + smp_call_function(scf_cleanup_handler, NULL, 1); torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); scf_torture_stats_print(); // -After- the stats thread is stopped! kfree(scf_stats_p); // -After- the last stats print has completed! scf_stats_p = NULL; + for (i = 0; i < nr_cpu_ids; i++) + scf_cleanup_free_list(i); + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) scftorture_print_module_parms("End of test: FAILURE"); else if (torture_onoff_failures()) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aeb595514461..95e40895a519 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. 
It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -827,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -936,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -964,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1071,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. 
+ */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -1187,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -1394,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. 
*/ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -2012,11 +2058,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2065,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) + sched_info_enqueue(rq, p); + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2039,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); - } + + psi_dequeue(p, flags); /* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' @@ -2130,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p, void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + struct task_struct *donor = rq->donor; + + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. 
*/ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } @@ -2615,9 +2663,7 @@ int push_cpu_stop(void *arg) // XXX validate p is still the highest prio task if (task_rq(p) == rq) { - deactivate_task(rq, p, 0); - set_task_cpu(p, lowest_rq->cpu); - activate_task(lowest_rq, p, 0); + move_queued_task_locked(rq, lowest_rq, p); resched_curr(lowest_rq); } @@ -2677,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* @@ -2691,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -3303,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -4323,9 +4368,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. 
* * Returns: * Whatever @func returns @@ -4418,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -4705,7 +4752,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_should_scx(p)) { + } else if (task_should_scx(p->policy)) { p->sched_class = &ext_sched_class; #endif } else { @@ -5511,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -5579,7 +5626,8 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5590,19 +5638,23 @@ void sched_tick(void) sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; - curr = rq->curr; - psi_account_irqtime(rq, curr, NULL); + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); - curr->sched_class->task_tick(rq, curr, 0); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); 
calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, curr); + task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5612,8 +5664,8 @@ void sched_tick(void) perf_event_task_tick(); - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); #ifdef CONFIG_SMP if (!scx_switched_all()) { @@ -5680,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr = rq->curr; if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). + */ + SCHED_WARN_ON(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5914,12 +5972,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, #ifdef CONFIG_SCHED_CLASS_EXT /* - * SCX requires a balance() call before every pick_next_task() including - * when waking up from SCHED_IDLE. If @start_class is below SCX, start - * from SCX instead. + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. 
*/ - if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } #endif /* @@ -6300,10 +6361,7 @@ static bool try_steal_cookie(int this, int that) if (sched_task_is_throttled(p, this)) goto next; - deactivate_task(src, p, 0); - set_task_cpu(p, this); - activate_task(dst, p, 0); - + move_queued_task_locked(src, dst, p); resched_curr(dst); success = true; @@ -6498,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #define SM_RTLOCK_WAIT 2 /* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long task_state) +{ + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; +} + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -6544,6 +6641,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. 
*/ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6604,36 +6702,12 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { - int flags = DEQUEUE_NOCLOCK; - - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(prev_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - block_task(rq, prev, flags); - } + block = try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -6674,7 +6748,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); @@ -7017,20 +7091,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - 
p->sched_class = &ext_sched_class; + if (task_should_scx(policy)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7076,7 +7150,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7134,8 +7208,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -7171,7 +7250,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) @@ -7333,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -7340,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -7347,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * 
dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -7354,30 +7446,41 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif @@ -7397,6 +7500,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: @@ -7406,6 +7510,7 @@ static void __sched_dynamic_update(int mode) 
preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; @@ -7417,6 +7522,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; @@ -7428,9 +7534,22 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -7493,6 +7612,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -7513,6 +7634,7 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); #else /* 
!CONFIG_PREEMPT_DYNAMIC: */ @@ -7665,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -7681,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); #endif @@ -7703,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP @@ -7792,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -8528,6 +8647,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". 
*/ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; @@ -8942,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -10235,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, */ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); return src_cid; } @@ -10247,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; struct mm_struct *mm = t->mm; - int src_cid, dst_cid, src_cpu; + int src_cid, src_cpu; + bool dst_cid_is_set; struct rq *src_rq; lockdep_assert_rq_held(dst_rq); @@ -10264,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allocation closest to 0 in cases where few threads migrate around * many CPUs. * - * If destination cid is already set, we may have to just clear - * the src cid to ensure compactness in frequent migrations - * scenarios. + * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. * * It is not useful to clear the src cid when the number of threads is * greater or equal to the number of allowed CPUs, because user-space @@ -10274,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allowed CPUs. 
*/ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid = READ_ONCE(dst_pcpu_cid->cid); - if (!mm_cid_is_unset(dst_cid) && - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) return; src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); src_rq = cpu_rq(src_cpu); @@ -10287,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) src_cid); if (src_cid == -1) return; - if (!mm_cid_is_unset(dst_cid)) { + if (dst_cid_is_set) { __mm_cid_put(mm, src_cid); return; } /* Move src_cid to dst cpu. */ mm_cid_snapshot_time(dst_rq, mm); WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); } static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, @@ -10465,7 +10588,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) @@ -10530,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t) * Matches barrier in sched_mm_cid_remote_clear_old(). 
*/ smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c6ba15388ea7..28c77904ea74 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - sugov_eas_rebuild_sd(); - out: + sugov_eas_rebuild_sd(); mutex_unlock(&global_tunables_lock); return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..d9d5a702f1a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try 
to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
*/ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2380,12 +2381,12 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } @@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2513,7 +2506,7 @@ next_node: if (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); @@ -2707,8 +2700,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2751,9 +2744,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. 
*/ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f4035c7a0fa1..a48b2a701ec2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { static const char * preempt_modes[] = { - "none", "voluntary", "full" + "none", "voluntary", "full", "lazy", }; - int i; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; - for (i = 0; i < 
ARRAY_SIZE(preempt_modes); i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3c4a94e4258f..7fff1d045477 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2751,7 +2751,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); if (static_branch_unlikely(&scx_ops_cpu_preempt) && unlikely(rq->scx.cpu_released)) { @@ -2762,7 +2762,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * emitted in switch_class(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -3065,12 +3065,11 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; /* - * If balance_scx() is telling us to keep running @prev, replenish slice - * if necessary and keep running @prev. Otherwise, pop the first one - * from the local DSQ. - * * WORKAROUND: * * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just @@ -3079,22 +3078,41 @@ static struct task_struct *pick_task_scx(struct rq *rq) * which then ends up calling pick_task_scx() without preceding * balance_scx(). * - * For now, ignore cases where $prev is not on SCX. This isn't great and - * can theoretically lead to stalls. However, for switch_all cases, this - * happens only while a BPF scheduler is being loaded or unloaded, and, - * for partial cases, fair will likely keep triggering this CPU. + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. 
* - * Once fair is fixed, restore WARN_ON_ONCE(). + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). + */ + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev_on_scx) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && !prev_on_scx)) { + /* only allowed during transitions */ + WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. */ - if ((rq->scx.flags & SCX_RQ_BAL_KEEP) && - prev->sched_class == &ext_sched_class) { + if (keep_prev) { p = prev; if (!p->scx.slice) p->scx.slice = SCX_SLICE_DFL; } else { p = first_local_task(rq); - if (!p) + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); return NULL; + } if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { @@ -3908,12 +3926,7 @@ static void scx_ops_exit_task(struct task_struct *p) void init_scx_entity(struct sched_ext_entity *scx) { - /* - * init_idle() calls this function again after fork sequence is - * complete. Don't touch ->tasks_node as it's already linked. - */ - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - + memset(scx, 0, sizeof(*scx)); INIT_LIST_HEAD(&scx->dsq_list.node); RB_CLEAR_NODE(&scx->dsq_priq); scx->sticky_cpu = -1; @@ -4616,14 +4629,14 @@ static const struct kset_uevent_ops scx_uevent_ops = { * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. 
*/ -bool task_should_scx(struct task_struct *p) +bool task_should_scx(int policy) { if (!scx_enabled() || unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; - return p->policy == SCHED_EXT; + return policy == SCHED_EXT; } /** @@ -4902,11 +4915,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - __setscheduler_prio(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -5615,12 +5633,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = SCX_SLICE_DFL; - __setscheduler_prio(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 246019519231..b1675bb59fc4 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq); void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); -bool 
task_should_scx(struct task_struct *p); +bool task_should_scx(int policy); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..fbdca89c677f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq, */ s64 update_curr_common(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - delta_exec = update_curr_se(rq, &curr->se); + delta_exec = update_curr_se(rq, &donor->se); if (likely(delta_exec > 0)) - update_curr_task(curr, delta_exec); + update_curr_task(donor, delta_exec); return delta_exec; } @@ -1247,18 +1247,18 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { - resched_curr(rq); + resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void @@ -3369,7 +3369,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3491,7 +3491,7 @@ retry_pids: */ if (vma_pids_forced) break; - } for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5625,8 +5625,9 @@ pick_next_entity(struct 
rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -5677,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif } @@ -6058,10 +6053,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -6818,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -6833,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -7173,7 +7171,11 @@ static int dequeue_entities(struct rq *rq, struct 
sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7190,12 +7192,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); return true; } @@ -8753,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se) */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8784,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. 
*/ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; if (!sched_feat(WAKEUP_PREEMPTION)) @@ -8832,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } static struct task_struct *pick_task_fair(struct rq *rq) @@ -13083,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -13174,22 +13178,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. - */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) @@ -13206,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. 
*/ - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); else wakeup_preempt(rq, p, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 290874079f60..a3d331dd2d8f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) */ SCHED_FEAT(RUN_TO_PARITY, true) /* - * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. */ SCHED_FEAT(PREEMPT_SHORT, true) @@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) -SCHED_FEAT(DOUBLE_TICK, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d2f096bb274c..621696269584 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -271,7 +271,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); /* * Interrupts shouldn't be re-enabled from that point on until @@ -399,8 +398,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED_HARD); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a9c65d97b3ca..fc07382361a8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) bool update_other_load_avgs(struct rq *rq) { u64 now = rq_clock_pelt(rq); - const struct sched_class *curr_class = rq->curr->sched_class; + const struct sched_class *curr_class = rq->donor->sched_class; unsigned long hw_pressure = 
arch_scale_hw_pressure(cpu_of(rq)); lockdep_assert_rq_held(rq); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 020d58967d4e..84dad1511d1e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; + u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); @@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, 
cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } @@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now, irq; s64 delta; + u64 irq; if (static_branch_likely(&psi_disabled)) return; @@ -1011,7 +1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st if (prev && task_psi_group(prev) == group) return; - now = cpu_clock(cpu); irq = irq_time_read(cpu); delta = (s64)(irq - rq->psi_irq_time); if (delta < 0) @@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st rq->psi_irq_time = irq; do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 172c588de542..bd66a46b06ac 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = 
rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; delta_exec = update_curr_common(rq); @@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq) return; #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity *rt_se = &curr->rt; + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; @@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task); static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; @@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * systems like big.LITTLE. */ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1606,12 +1607,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. 
- */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) */ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); @@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); 
BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); @@ -2000,7 +1991,7 @@ retry: * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } @@ -2021,7 +2012,7 @@ retry: * Note that the stoppers are masqueraded as SCHED_FIFO * (cf. sched_set_stop_task()), so we can't rely on rt_task(). */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) return 0; cpu = find_lowest_rq(rq->curr); @@ -2088,9 +2079,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); ret = 1; @@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq) * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); resched = true; } /* @@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < 
rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then reschedule. */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6085ef50febf..76f5f53a645f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -751,8 +751,9 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 3, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -1147,7 +1148,10 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1344,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -2085,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) #endif /* CONFIG_SMP */ -#include "stats.h" - -#if 
defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ - -static inline void sched_core_account_forceidle(struct rq *rq) { } - -static inline void sched_core_tick(struct rq *rq) { } - -#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2260,11 +2241,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. 
+ */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2451,7 +2446,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); prev->sched_class->put_prev_task(rq, prev, NULL); } @@ -2615,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node) static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2695,6 +2690,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); @@ -2769,8 +2765,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2778,6 +2772,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) 
+ * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3169,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { @@ -3599,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid = __this_cpu_read(pcpu_cid->recent_cid); - cpumask = mm_cidmask(mm); + /* Try to re-use recent cid. This improves cache locality. 
*/ + if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = atomic_read(&mm->max_nr_cid); + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; return cid; @@ -3634,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3644,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3670,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. 
*/ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3687,7 +3759,8 @@ end: return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3704,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); return cid; } @@ -3758,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } #else /* !CONFIG_SCHED_MM_CID: */ @@ -3771,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif #ifdef CONFIG_RT_MUTEXES @@ -3800,7 +3896,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int 
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(int policy, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..8ee0add5a48a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,44 +119,71 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. 
*/ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, int flags) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (flags & ENQUEUE_MIGRATED) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) + if (flags & DEQUEUE_SLEEP) return; + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. 
+ */ psi_task_change(p, p->psi_flags, 0); } @@ -190,8 +217,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..0d71fcbaf1e3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,18 +706,23 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = 
newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); @@ -1076,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. 
@@ -1159,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #endif } - return sched_attr_copy_to_user(uattr, &kattr, usize); + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } #ifdef CONFIG_SMP diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 134d7112ef71..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -9,7 +9,7 @@ static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } 
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hash-table's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address and bit will be woken, and only if the + * bit is clear. + * + * In order for this to function properly there must be a full memory + * barrier after the bit is cleared and before this function is called. 
+ * If the bit was cleared atomically, such as by clear_bit() then + * smp_mb__after_atomic() can be used, otherwise smp_mb() is needed. + * If the bit was cleared with a fully-ordered operation, no further + * barrier is required. + * + * Normally the bit should be cleared by an operation with RELEASE + * semantics so that any changes to memory made before the bit is + * cleared are guaranteed to be visible after the matching wait_on_bit() + * completes. */ -void wake_up_bit(void *word, int bit) +void wake_up_bit(unsigned long *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } @@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int } EXPORT_SYMBOL(init_wait_var_entry); +/** + * wake_up_var - wake up waiters on a variable (kernel address) + * @var: the address of the variable being waited on + * + * Wake up any process waiting in wait_var_event() or similar for the + * given variable to change. wait_var_event() can be waiting for an + * arbitrary condition to be true and associates that condition with an + * address. Calling wake_up_var() suggests that the condition has been + * made true, but does not strictly require the condition to use the + * address given. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address will be woken. + * + * In order for this to function properly there must be a full memory + * barrier after the variable is updated (or more accurately, after the + * condition waited on has been made to be true) and before this function + * is called. If the variable was updated atomically, such as by + * atomic_dec() then smp_mb__after_atomic() can be used. If the + * variable was updated by a fully ordered operation such as + * atomic_dec_and_test() then no extra barrier is required. Otherwise + * smp_mb() is needed. 
+ * + * Normally the variable should be updated (the condition should be made + * to be true) by an operation with RELEASE semantics such as + * smp_store_release() so that any changes to memory made before the + * variable was updated are guaranteed to be visible after the matching + * wait_var_event() completes. + */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; diff --git a/kernel/signal.c b/kernel/signal.c index 6e57036f947f..98b65cb35830 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -59,6 +59,8 @@ #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ +#include "time/posix-timers.h" + /* * SLAB caches for signal bits. 
*/ @@ -396,16 +398,9 @@ void task_join_group_stop(struct task_struct *task) task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } -/* - * allocate a new signal queue record - * - this may be called without locks if and only if t == current, otherwise an - * appropriate lock must be held to stop the target task from exiting - */ -static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, - int override_rlimit, const unsigned int sigqueue_flags) +static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, + int override_rlimit) { - struct sigqueue *q = NULL; struct ucounts *ucounts; long sigpending; @@ -419,31 +414,59 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, + override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); - } else { + if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { + dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); + return NULL; } - if (unlikely(q == NULL)) { + return ucounts; +} + +static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, + const unsigned int sigqueue_flags) +{ + INIT_LIST_HEAD(&q->list); + q->flags = sigqueue_flags; + q->ucounts = ucounts; +} + +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appropriate lock must be held to stop the target task from exiting + */ +static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, + int override_rlimit) +{ + struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); + struct sigqueue 
*q; + + if (!ucounts) + return NULL; + + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); + if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); - } else { - INIT_LIST_HEAD(&q->list); - q->flags = sigqueue_flags; - q->ucounts = ucounts; + return NULL; } + + __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { - if (q->flags & SIGQUEUE_PREALLOC) + if (q->flags & SIGQUEUE_PREALLOC) { + posixtimer_sigqueue_putref(q); return; + } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; @@ -478,42 +501,6 @@ void flush_signals(struct task_struct *t) } EXPORT_SYMBOL(flush_signals); -#ifdef CONFIG_POSIX_TIMERS -static void __flush_itimer_signals(struct sigpending *pending) -{ - sigset_t signal, retain; - struct sigqueue *q, *n; - - signal = pending->signal; - sigemptyset(&retain); - - list_for_each_entry_safe(q, n, &pending->list, list) { - int sig = q->info.si_signo; - - if (likely(q->info.si_code != SI_TIMER)) { - sigaddset(&retain, sig); - } else { - sigdelset(&signal, sig); - list_del_init(&q->list); - __sigqueue_free(q); - } - } - - sigorsets(&pending->signal, &signal, &retain); -} - -void flush_itimer_signals(void) -{ - struct task_struct *tsk = current; - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - __flush_itimer_signals(&tsk->pending); - __flush_itimer_signals(&tsk->signal->shared_pending); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); -} -#endif - void ignore_signals(struct task_struct *t) { int i; @@ -563,7 +550,7 @@ bool unhandled_signal(struct task_struct *tsk, int sig) } static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, - bool *resched_timer) + struct sigqueue **timer_sigq) { struct sigqueue *q, *first = NULL; @@ -586,12 +573,17 @@ still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); - *resched_timer = - (first->flags & SIGQUEUE_PREALLOC) && - (info->si_code 
== SI_TIMER) && - (info->si_sys_private); - - __sigqueue_free(first); + /* + * posix-timer signals are preallocated and freed when the last + * reference count is dropped in posixtimer_deliver_signal() or + * immediately on timer deletion when the signal is not pending. + * Spare the extra round through __sigqueue_free() which is + * ignoring preallocated signals. + */ + if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) + *timer_sigq = first; + else + __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -608,12 +600,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - kernel_siginfo_t *info, bool *resched_timer) + kernel_siginfo_t *info, struct sigqueue **timer_sigq) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info, resched_timer); + collect_signal(sig, pending, info, timer_sigq); return sig; } @@ -625,42 +617,22 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { struct task_struct *tsk = current; - bool resched_timer = false; + struct sigqueue *timer_sigq; int signr; lockdep_assert_held(&tsk->sighand->siglock); +again: *type = PIDTYPE_PID; - signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); + timer_sigq = NULL; + signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq); if (!signr) { *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info, &resched_timer); -#ifdef CONFIG_POSIX_TIMERS - /* - * itimer signal ? - * - * itimers are process shared and we restart periodic - * itimers in the signal delivery path to prevent DoS - * attacks in the high resolution timer case. This is - * compliant with the old way of self-restarting - * itimers, as the SIGALRM is a legacy signal and only - * queued once. 
Changing the restart behaviour to - * restart the timer in the signal dequeue path is - * reducing the timer noise on heavy loaded !highres - * systems too. - */ - if (unlikely(signr == SIGALRM)) { - struct hrtimer *tmr = &tsk->signal->real_timer; - - if (!hrtimer_is_queued(tmr) && - tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); - hrtimer_restart(tmr); - } - } -#endif + mask, info, &timer_sigq); + + if (unlikely(signr == SIGALRM)) + posixtimer_rearm_itimer(tsk); } recalc_sigpending(); @@ -682,22 +654,12 @@ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } -#ifdef CONFIG_POSIX_TIMERS - if (resched_timer) { - /* - * Release the siglock to ensure proper locking order - * of timer locks outside of siglocks. Note, we leave - * irqs disabled here, since the posix-timers code is - * about to disable them again anyway. - */ - spin_unlock(&tsk->sighand->siglock); - posixtimer_rearm(info); - spin_lock(&tsk->sighand->siglock); - /* Don't expose the si_sys_private value to userspace */ - info->si_sys_private = 0; + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) { + if (!posixtimer_deliver_signal(info, timer_sigq)) + goto again; } -#endif + return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); @@ -772,17 +734,24 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) kick_process(t); } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. 
- */ -static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); + +static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) +{ + if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) + __sigqueue_free(q); + else + posixtimer_sig_ignore(tsk, q); +} + +/* Remove signals in mask from the pending set and queue. */ +static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; + lockdep_assert_held(&p->sighand->siglock); + sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; @@ -791,7 +760,7 @@ static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); - __sigqueue_free(q); + sigqueue_free_ignored(p, q); } } } @@ -916,18 +885,18 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * This is a stop signal. Remove SIGCONT from all queues. */ siginitset(&flush, sigmask(SIGCONT)); - flush_sigqueue_mask(&flush, &signal->shared_pending); + flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) - flush_sigqueue_mask(&flush, &t->pending); + flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. 
*/ siginitset(&flush, SIG_KERNEL_STOP_MASK); - flush_sigqueue_mask(&flush, &signal->shared_pending); + flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { - flush_sigqueue_mask(&flush, &t->pending); + flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; @@ -1114,7 +1083,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0); + q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); @@ -1922,112 +1891,242 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); +#ifdef CONFIG_POSIX_TIMERS /* - * These functions support sending signals using preallocated sigqueue - * structures. This is needed "because realtime applications cannot - * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of POSIX Timers - * we allocate the sigqueue structure from the timer_create. If this - * allocation fails we are able to report the failure to the application - * with an EAGAIN error. + * These functions handle POSIX timer signals. POSIX timers use + * preallocated sigqueue structs for sending signals. 
*/ -struct sigqueue *sigqueue_alloc(void) +static void __flush_itimer_signals(struct sigpending *pending) { - return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC); + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); } -void sigqueue_free(struct sigqueue *q) +void flush_itimer_signals(void) { - spinlock_t *lock = &current->sighand->siglock; - unsigned long flags; + struct task_struct *tsk = current; - if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) - return; - /* - * We must hold ->siglock while testing q->list - * to serialize with collect_signal() or with - * __exit_signal()->flush_sigqueue(). - */ - spin_lock_irqsave(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - /* - * If it is queued it will be freed when dequeued, - * like the "regular" sigqueue. - */ - if (!list_empty(&q->list)) - q = NULL; - spin_unlock_irqrestore(lock, flags); + guard(spinlock_irqsave)(&tsk->sighand->siglock); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); +} - if (q) - __sigqueue_free(q); +bool posixtimer_init_sigqueue(struct sigqueue *q) +{ + struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); + + if (!ucounts) + return false; + clear_siginfo(&q->info); + __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); + return true; } -int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) +static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { - int sig = q->info.si_signo; struct sigpending *pending; + int sig = q->info.si_signo; + + signalfd_notify(t, sig); + pending = (type != PIDTYPE_PID) ?
&t->signal->shared_pending : &t->pending; + list_add_tail(&q->list, &pending->list); + sigaddset(&pending->signal, sig); + complete_signal(sig, t, type); +} + +/* + * This function is used by POSIX timers to deliver a timer signal. + * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID + * set), the signal must be delivered to the specific thread (queues + * into t->pending). + * + * Where type is not PIDTYPE_PID, signals must be delivered to the + * process. In this case, prefer to deliver to current if it is in + * the same thread group as the target process, which avoids + * unnecessarily waking up a potentially idle task. + */ +static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) +{ + struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); + + if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current)) + t = current; + return t; +} + +void posixtimer_send_sigqueue(struct k_itimer *tmr) +{ + struct sigqueue *q = &tmr->sigq; + int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; - int ret, result; + int result; - if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) - return 0; - if (WARN_ON_ONCE(q->info.si_code != SI_TIMER)) - return 0; + guard(rcu)(); - ret = -1; - rcu_read_lock(); + t = posixtimer_get_target(tmr); + if (!t) + return; + + if (!likely(lock_task_sighand(t, &flags))) + return; /* - * This function is used by POSIX timers to deliver a timer signal. - * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID - * set), the signal must be delivered to the specific thread (queues - * into t->pending). - * - * Where type is not PIDTYPE_PID, signals must be delivered to the - * process. In this case, prefer to deliver to current if it is in - * the same thread group as the target process, which avoids - * unnecessarily waking up a potentially idle task. 
+ * Update @tmr::sigqueue_seq for posix timer signals with sighand + * locked to prevent a race against dequeue_signal(). */ - t = pid_task(pid, type); - if (!t) - goto ret; - if (type != PIDTYPE_PID && same_thread_group(t, current)) - t = current; - if (!likely(lock_task_sighand(t, &flags))) - goto ret; + tmr->it_sigqueue_seq = tmr->it_signal_seq; - ret = 1; /* the signal is ignored */ - result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, false)) + /* + * Set the signal delivery status under sighand lock, so that the + * ignored signal handling can distinguish between a periodic and a + * non-periodic timer. + */ + tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; + + if (!prepare_signal(sig, t, false)) { + result = TRACE_SIGNAL_IGNORED; + + if (!list_empty(&q->list)) { + /* + * If task group is exiting with the signal already pending, + * wait for __exit_signal() to do its job. Otherwise if + * ignored, it's not supposed to be queued. Try to survive. + */ + WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT)); + goto out; + } + + /* Periodic timers with SIG_IGN are queued on the ignored list */ + if (tmr->it_sig_periodic) { + /* + * Already queued means the timer was rearmed after + * the previous expiry got it on the ignore list. + * Nothing to do for that case. + */ + if (hlist_unhashed(&tmr->ignored_list)) { + /* + * Take a signal reference and queue it on + * the ignored list. + */ + posixtimer_sigqueue_getref(q); + posixtimer_sig_ignore(t, q); + } + } else if (!hlist_unhashed(&tmr->ignored_list)) { + /* + * Covers the case where a timer was periodic and + * then the signal was ignored. Later it was rearmed + * as oneshot timer. The previous signal is invalid + * now, and this oneshot signal has to be dropped. + * Remove it from the ignored list and drop the + * reference count as the signal is not longer + * queued. 
+ */ + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } goto out; + } + + /* This should never happen and leaks a reference count */ + if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list))) + hlist_del_init(&tmr->ignored_list); - ret = 0; if (unlikely(!list_empty(&q->list))) { - /* - * If an SI_TIMER entry is already queue just increment - * the overrun count. - */ - q->info.si_overrun++; + /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } - q->info.si_overrun = 0; - signalfd_notify(t, sig); - pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; - list_add_tail(&q->list, &pending->list); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, type); + posixtimer_sigqueue_getref(q); + posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); + trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); -ret: - rcu_read_unlock(); - return ret; } +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + /* + * If the timer is marked deleted already or the signal originates + * from a non-periodic timer, then just drop the reference + * count. Otherwise queue it on the ignored list. + */ + if (tmr->it_signal && tmr->it_sig_periodic) + hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); + else + posixtimer_putref(tmr); +} + +static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) +{ + struct hlist_head *head = &tsk->signal->ignored_posix_timers; + struct hlist_node *tmp; + struct k_itimer *tmr; + + if (likely(hlist_empty(head))) + return; + + /* + * Rearming a timer with sighand lock held is not possible due to + * lock ordering vs. tmr::it_lock. 
Just stick the sigqueue back and + * let the signal delivery path deal with it whether it needs to be + * rearmed or not. This cannot be decided here w/o dropping sighand + * lock and creating a loop retry horror show. + */ + hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) { + struct task_struct *target; + + /* + * tmr::sigq.info.si_signo is immutable, so accessing it + * without holding tmr::it_lock is safe. + */ + if (tmr->sigq.info.si_signo != sig) + continue; + + hlist_del_init(&tmr->ignored_list); + + /* This should never happen and leaks a reference count */ + if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) + continue; + + /* + * Get the target for the signal. If target is a thread and + * has exited by now, drop the reference count. + */ + guard(rcu)(); + target = posixtimer_get_target(tmr); + if (target) + posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); + else + posixtimer_putref(tmr); + } +} +#else /* CONFIG_POSIX_TIMERS */ +static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } +static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } +#endif /* !CONFIG_POSIX_TIMERS */ + void do_notify_pidfd(struct task_struct *task) { struct pid *pid = task_pid(task); @@ -2888,8 +2987,6 @@ relock: current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { - int ret; - if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); @@ -2901,24 +2998,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - ret = do_coredump(&ksig->info); - if (ret) - coredump_report_failure("coredump has not been created, error %d", - ret); - else if (!IS_ENABLED(CONFIG_COREDUMP)) { - /* - * Coredumps are not available, can't fail collecting - * the coredump. - * - * Leave a note though that the coredump is going to be - * not created. 
This is not an error or a warning as disabling - * support in the kernel for coredumps isn't commonplace, and - * the user must've built the kernel with the custom config so - * let them know all works as desired. - */ - coredump_report("no coredump collected as " - "that is disabled in the kernel configuration"); - } + do_coredump(&ksig->info); } /* @@ -3927,7 +4007,6 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { int ret; - struct fd f; struct pid *pid; kernel_siginfo_t kinfo; enum pid_type type; @@ -3940,20 +4019,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - f = fdget(pidfd); - if (!fd_file(f)) + CLASS(fd, f)(pidfd); + if (fd_empty(f)) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) { - ret = PTR_ERR(pid); - goto err; - } + if (IS_ERR(pid)) + return PTR_ERR(pid); - ret = -EINVAL; if (!access_pidfd_pidns(pid)) - goto err; + return -EINVAL; switch (flags) { case 0: @@ -3977,28 +4053,23 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) - goto err; + return ret; - ret = -EINVAL; if (unlikely(sig != kinfo.si_signo)) - goto err; + return -EINVAL; /* Only allow sending arbitrary signals to yourself. 
*/ - ret = -EPERM; if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - goto err; + return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) - ret = kill_pgrp_info(sig, &kinfo, pid); + return kill_pgrp_info(sig, &kinfo, pid); else - ret = kill_pid_info_type(sig, &kinfo, pid, type); -err: - fdput(f); - return ret; + return kill_pid_info_type(sig, &kinfo, pid, type); } static int @@ -4172,8 +4243,8 @@ void kernel_sigaction(int sig, __sighandler_t action) sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &current->signal->shared_pending); - flush_sigqueue_mask(&mask, &current->pending); + flush_sigqueue_mask(current, &mask, &current->signal->shared_pending); + flush_sigqueue_mask(current, &mask, &current->pending); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); @@ -4223,6 +4294,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) sigaction_compat_abi(act, oact); if (act) { + bool was_ignored = k->sa.sa_handler == SIG_IGN; + sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; @@ -4240,9 +4313,11 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &p->signal->shared_pending); + flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) - flush_sigqueue_mask(&mask, &t->pending); + flush_sigqueue_mask(p, &mask, &t->pending); + } else if (was_ignored) { + posixtimer_sig_unignore(p, sig); } } diff --git a/kernel/smp.c b/kernel/smp.c index f25e20617b7e..27dc31a146a3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -246,7 +246,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in return true; } - ts2 = sched_clock(); + ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/
ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * @@ -321,7 +321,7 @@ static void __csd_lock_wait(call_single_data_t *csd) int bug_id = 0; u64 ts0, ts1; - ts1 = ts0 = sched_clock(); + ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; diff --git a/kernel/softirq.c b/kernel/softirq.c index d082e7840f88..8b41bd13cc3d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -624,6 +624,24 @@ static inline void tick_irq_exit(void) #endif } +#ifdef CONFIG_IRQ_FORCED_THREADING +DEFINE_PER_CPU(struct task_struct *, ktimerd); +DEFINE_PER_CPU(unsigned long, pending_timer_softirq); + +static void wake_timersd(void) +{ + struct task_struct *tsk = __this_cpu_read(ktimerd); + + if (tsk) + wake_up_process(tsk); +} + +#else + +static inline void wake_timersd(void) { } + +#endif + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -636,6 +654,10 @@ static inline void __irq_exit_rcu(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() && + local_timers_pending_force_th() && !(in_nmi() | in_hardirq())) + wake_timersd(); + tick_irq_exit(); } @@ -748,10 +770,8 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); static bool tasklet_clear_sched(struct tasklet_struct *t) { - if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { - wake_up_var(&t->state); + if (test_and_clear_wake_up_bit(TASKLET_STATE_SCHED, &t->state)) return true; - } WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n", t->use_callback ? 
"callback" : "func", @@ -871,8 +891,7 @@ void tasklet_kill(struct tasklet_struct *t) if (in_interrupt()) pr_notice("Attempt to kill tasklet from interrupt\n"); - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); + wait_on_bit_lock(&t->state, TASKLET_STATE_SCHED, TASK_UNINTERRUPTIBLE); tasklet_unlock_wait(t); tasklet_clear_sched(t); @@ -882,16 +901,13 @@ EXPORT_SYMBOL(tasklet_kill); #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_atomic(); - clear_bit(TASKLET_STATE_RUN, &t->state); - smp_mb__after_atomic(); - wake_up_var(&t->state); + clear_and_wake_up_bit(TASKLET_STATE_RUN, &t->state); } EXPORT_SYMBOL_GPL(tasklet_unlock); void tasklet_unlock_wait(struct tasklet_struct *t) { - wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); + wait_on_bit(&t->state, TASKLET_STATE_RUN, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(tasklet_unlock_wait); #endif @@ -971,12 +987,57 @@ static struct smp_hotplug_thread softirq_threads = { .thread_comm = "ksoftirqd/%u", }; +#ifdef CONFIG_IRQ_FORCED_THREADING +static void ktimerd_setup(unsigned int cpu) +{ + /* Above SCHED_NORMAL to handle timers before regular tasks. 
*/ + sched_set_fifo_low(current); +} + +static int ktimerd_should_run(unsigned int cpu) +{ + return local_timers_pending_force_th(); +} + +void raise_ktimers_thread(unsigned int nr) +{ + trace_softirq_raise(nr); + __this_cpu_or(pending_timer_softirq, BIT(nr)); +} + +static void run_ktimerd(unsigned int cpu) +{ + unsigned int timer_si; + + ksoftirqd_run_begin(); + + timer_si = local_timers_pending_force_th(); + __this_cpu_write(pending_timer_softirq, 0); + or_softirq_pending(timer_si); + + __do_softirq(); + + ksoftirqd_run_end(); +} + +static struct smp_hotplug_thread timer_thread = { + .store = &ktimerd, + .setup = ktimerd_setup, + .thread_should_run = ktimerd_should_run, + .thread_fn = run_ktimerd, + .thread_comm = "ktimers/%u", +}; +#endif + static __init int spawn_ksoftirqd(void) { cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); - +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) + BUG_ON(smpboot_register_percpu_thread(&timer_thread)); +#endif return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 639397b5491c..5259cda486d0 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -411,6 +411,17 @@ static void static_call_del_module(struct module *mod) for (site = start; site < stop; site++) { key = static_call_key(site); + + /* + * If the key was not updated due to a memory allocation + * failure in __static_call_init() then treating key::sites + * as key::mods in the code below would cause random memory + * access and #GP. In that case all subsequent sites have + * not been touched either, so stop iterating. 
+ */ + if (!static_call_key_has_mods(key)) + break; + if (key == prev_key) continue; @@ -442,7 +453,7 @@ static int static_call_module_notify(struct notifier_block *nb, case MODULE_STATE_COMING: ret = static_call_add_module(mod); if (ret) { - WARN(1, "Failed to allocate memory for static calls"); + pr_warn("Failed to allocate memory for static calls\n"); static_call_del_module(mod); } break; diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..c4c701c6f0b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1911,12 +1911,11 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct fd exe; + CLASS(fd, exe)(fd); struct inode *inode; int err; - exe = fdget(fd); - if (!fd_file(exe)) + if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); @@ -1926,18 +1925,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * sure that this one is executable as well, to avoid breaking an * overall picture. 
*/ - err = -EACCES; if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) - goto exit; + return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) - goto exit; + return err; - err = replace_mm_exe_file(mm, fd_file(exe)); -exit: - fdput(exe); - return err; + return replace_mm_exe_file(mm, fd_file(exe)); } /* @@ -2324,6 +2319,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2794,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/task_work.c b/kernel/task_work.c index 5d14d639ac71..c969f1f26be5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; + int flags = notify & TWA_FLAGS; + notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return 
-EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + /* + * Record the work call stack in order to print it in KASAN + * reports. + * + * Note that stack allocation can fail if TWAF_NO_ALLOC flag + * is set and new page is needed to expand the stack buffer. + */ + if (flags & TWAF_NO_ALLOC) + kasan_record_aux_stack_noalloc(work); + else + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0700f40c53ac..0cd680ccc7e5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -411,15 +411,14 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct nlattr *na; size_t size; u32 fd; - struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -427,14 +426,13 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) - goto err; + return rc; na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { nlmsg_free(rep_skb); - rc = -EMSGSIZE; - goto err; + return -EMSGSIZE; } stats = nla_data(na); @@ -443,14 +441,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); - goto err; + return rc; } - rc = send_reply(rep_skb, info); - -err: - fdput(f); - return rc; + return send_reply(rep_skb, info); } static int cmd_attr_register_cpumask(struct genl_info *info) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8ebb6d5a106b..b0b97a60aaa6 100644 --- a/kernel/time/Kconfig +++ 
b/kernel/time/Kconfig @@ -17,11 +17,6 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool -# Clocksources require validation of the clocksource against the last -# cycle update - x86/TSC misfeature -config CLOCKSOURCE_VALIDATE_LAST_CYCLE - bool - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 4af2a264a160..fe0ae82124fe 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += time.o timer.o hrtimer.o +obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8bf888641694..0ddccdff119a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -197,28 +197,15 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - int ret = HRTIMER_NORESTART; - int restart = ALARMTIMER_NORESTART; - spin_lock_irqsave(&base->lock, flags); - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard (spinlock_irqsave, &base->lock) + alarmtimer_dequeue(base, alarm); if (alarm->function) - restart = alarm->function(alarm, base->get_ktime()); - - spin_lock_irqsave(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - alarmtimer_enqueue(base, alarm); - ret = HRTIMER_RESTART; - } - spin_unlock_irqrestore(&base->lock, flags); + alarm->function(alarm, base->get_ktime()); trace_alarmtimer_fired(alarm, base->get_ktime()); - return ret; - + return HRTIMER_NORESTART; } ktime_t alarm_expires_remaining(const struct alarm *alarm) @@ -334,10 +321,9 @@ static int alarmtimer_resume(struct device *dev) 
static void __alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); - alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -350,10 +336,10 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { - hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); @@ -480,35 +466,11 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - ktime_t now = base->get_ktime(); - - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { - /* - * Same issue as with posix_timer_fn(). Timers which are - * periodic but the signal is ignored can starve the system - * with a very small interval. The real fix which was - * promised in the context of posix_timer_fn() never - * materialized, but someone should really work on it. - * - * To prevent DOS fake @now to be 1 jiffy out which keeps - * the overrun accounting correct but creates an - * inconsistency vs. timer_gettime(2). 
- */ - ktime_t kj = NSEC_PER_SEC / HZ; - if (interval < kj) - now = ktime_add(now, kj); - } - - return alarm_forward(alarm, now, interval); -} - -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) -{ - return __alarm_forward_now(alarm, interval, false); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -567,30 +529,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Return: whether the timer is to be restarted */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, - ktime_t now) +static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { - struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); - enum alarmtimer_restart result = ALARMTIMER_NORESTART; - unsigned long flags; + struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - spin_lock_irqsave(&ptr->it_lock, flags); - - if (posix_timer_queue_signal(ptr) && ptr->it_interval) { - /* - * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. Ensure that - * small intervals cannot starve the system. 
- */ - ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); - ++ptr->it_requeue_pending; - ptr->it_active = 1; - result = ALARMTIMER_RESTART; - } - spin_unlock_irqrestore(&ptr->it_lock, flags); - - return result; + guard(spinlock_irqsave)(&ptr->it_lock); + posix_timer_queue_signal(ptr); } /** @@ -751,18 +695,14 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer - * - * Return: ALARMTIMER_NORESTART */ -static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, - ktime_t now) +static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { struct task_struct *task = alarm->data; alarm->data = NULL; if (task) wake_up_process(task); - return ALARMTIMER_NORESTART; } /** @@ -814,10 +754,10 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { - hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); + hrtimer_setup_on_stack(&alarm->timer, alarmtimer_fired, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); __alarm_init(alarm, type, function); } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 78c7bd64d0dd..f3e831f62906 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -337,13 +337,21 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, } /* - * Called after a notify add to make devices available which were - * released from the notifier call. + * Called after a clockevent has been added which might + * have replaced a current regular or broadcast device. A + * released normal device might be a suitable replacement + * for the current broadcast device. 
Similarly a released + * broadcast device might be a suitable replacement for a + * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; + /* + * Keep iterating as long as tick_check_new_device() + * replaces a device. + */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); @@ -610,39 +618,30 @@ void clockevents_resume(void) #ifdef CONFIG_HOTPLUG_CPU -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST /** - * tick_offline_cpu - Take CPU out of the broadcast mechanism + * tick_offline_cpu - Shutdown all clock events related + * to this CPU and take it out of the + * broadcast mechanism. * @cpu: The outgoing CPU * - * Called on the outgoing CPU after it took itself offline. + * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { - raw_spin_lock(&clockevents_lock); - tick_broadcast_offline(cpu); - raw_spin_unlock(&clockevents_lock); -} -# endif - -/** - * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu - * @cpu: The dead CPU - */ -void tick_cleanup_dead_cpu(int cpu) -{ struct clock_event_device *dev, *tmp; - unsigned long flags; - raw_spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); tick_shutdown(cpu); + /* * Unregister the clock event devices which were - * released from the users in the notify chain. + * released above. 
*/ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); + /* * Now check whether the CPU has left unused per cpu devices */ @@ -654,7 +653,8 @@ void tick_cleanup_dead_cpu(int cpu) list_del(&dev->list); } } - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + + raw_spin_unlock(&clockevents_lock); } #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 23336eecb4f4..aab6472853fa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -20,6 +20,8 @@ #include "tick-internal.h" #include "timekeeping_internal.h" +static void clocksource_enqueue(struct clocksource *cs); + static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { u64 delta = clocksource_delta(end, start, cs->mask); @@ -171,7 +173,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) } static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); static void clocksource_watchdog_work(struct work_struct *work) { @@ -191,6 +192,13 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } +static void clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -697,7 +705,7 @@ static int __clocksource_watchdog_kthread(void) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); + clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -1255,34 +1263,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); 
-static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); -} - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - unsigned long flags; - - mutex_lock(&clocksource_mutex); - clocksource_watchdog_lock(&flags); - __clocksource_change_rating(cs, rating); - clocksource_watchdog_unlock(&flags); - - clocksource_select(); - clocksource_select_watchdog(false); - clocksource_suspend_select(false); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_change_rating); - /* * Unbind clocksource @cs. Called with clocksource_mutex held */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cddcd08ea827..80fe3749d2db 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -417,6 +417,11 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); +} + static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { @@ -428,28 +433,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); - -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode); - -void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - 
clockid_t clock_id, enum hrtimer_mode mode) -{ - debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); - __hrtimer_init_sleeper(sl, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); - void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -459,6 +442,7 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } @@ -472,6 +456,13 @@ debug_init(struct hrtimer *timer, clockid_t clockid, trace_hrtimer_init(timer, clockid, mode); } +static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init_on_stack(timer); + trace_hrtimer_init(timer, clockid, mode); +} + static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { @@ -1544,6 +1535,11 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return HRTIMER_BASE_MONOTONIC; } +static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { @@ -1580,6 +1576,18 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timerqueue_init(&timer->node); } +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + __hrtimer_init(timer, clock_id, mode); + + if (WARN_ON_ONCE(!function)) + timer->function = hrtimer_dummy_timeout; + else + timer->function = function; +} + /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized @@ -1600,6 +1608,46 @@ void hrtimer_init(struct 
hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); +/** + * hrtimer_setup - initialize a timer to the given clock + * @timer: the timer to be initialized + * @function: the callback function + * @clock_id: the clock to be used + * @mode: The modes which are relevant for initialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started + */ +void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup); + +/** + * hrtimer_setup_on_stack - initialize a timer on stack memory + * @timer: The timer to be initialized + * @function: the callback function + * @clock_id: The clock to be used + * @mode: The timer mode + * + * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack + * memory. 
+ */ +void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_init_on_stack(timer, clock_id, mode); + __hrtimer_setup(timer, function, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); + /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated @@ -1811,7 +1859,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1906,7 +1954,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1944,7 +1992,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. 
*/ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -1987,19 +2035,18 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, } /** - * hrtimer_init_sleeper - initialize sleeper to the given clock + * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, - enum hrtimer_mode mode) +void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(&sl->timer, clock_id, mode); + debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); - } -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { @@ -2060,8 +2107,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); @@ -2075,7 +2121,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct hrtimer_sleeper t; int ret = 0; - hrtimer_init_sleeper_on_stack(&t, clockid, mode); + hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) @@ -2242,123 +2288,3 @@ void __init hrtimers_init(void) hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } - -/** - * schedule_hrtimeout_range_clock - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: 
slack in expires timeout (ktime_t) - * @mode: timer mode - * @clock_id: timer clock to be used - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, clockid_t clock_id) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. - */ - if (expires && *expires == 0) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - return -EINTR; - } - - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_sleeper_start_expires(&t, mode); - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly - * for regular (non RT/DL) tasks. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). 
- * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired. If the task was woken before the - * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or - * by an explicit wakeup, it returns -EINTR. 
- */ -int __sched schedule_hrtimeout(ktime_t *expires, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range(expires, 0, mode); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 00629e658ca1..876d389b2e21 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -151,7 +151,27 @@ COMPAT_SYSCALL_DEFINE2(getitimer, int, which, #endif /* - * The timer is automagically restarted, when interval != 0 + * Invoked from dequeue_signal() when SIGALRM is delivered. + * + * Restart the ITIMER_REAL timer if it is armed as a periodic timer. Doing + * this in the signal delivery path instead of self rearming prevents a DoS + * with small increments in the high resolution timer case and reduces timer + * noise in general. + */ +void posixtimer_rearm_itimer(struct task_struct *tsk) +{ + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } +} + +/* + * Interval timers are restarted in the signal delivery path. See + * posixtimer_rearm_itimer(). 
*/ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 802b336f4b8c..b550ebe0f03b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,22 +22,79 @@ #include "ntp_internal.h" #include "timekeeping_internal.h" - -/* - * NTP timekeeping variables: +/** + * struct ntp_data - Structure holding all NTP related state + * @tick_usec: USER_HZ period in microseconds + * @tick_length: Adjusted tick length + * @tick_length_base: Base value for @tick_length + * @time_state: State of the clock synchronization + * @time_status: Clock status bits + * @time_offset: Time adjustment in nanoseconds + * @time_constant: PLL time constant + * @time_maxerror: Maximum error in microseconds holding the NTP sync distance + * (NTP dispersion + delay / 2) + * @time_esterror: Estimated error in microseconds holding NTP dispersion + * @time_freq: Frequency offset scaled nsecs/secs + * @time_reftime: Time at last adjustment in seconds + * @time_adjust: Adjustment value + * @ntp_tick_adj: Constant boot-param configurable NTP tick adjustment (upscaled) + * @ntp_next_leap_sec: Second value of the next pending leapsecond, or TIME64_MAX if no leap * - * Note: All of the NTP state is protected by the timekeeping locks. + * @pps_valid: PPS signal watchdog counter + * @pps_tf: PPS phase median filter + * @pps_jitter: PPS current jitter in nanoseconds + * @pps_fbase: PPS beginning of the last freq interval + * @pps_shift: PPS current interval duration in seconds (shift value) + * @pps_intcnt: PPS interval counter + * @pps_freq: PPS frequency offset in scaled ns/s + * @pps_stabil: PPS current stability in scaled ns/s + * @pps_calcnt: PPS monitor: calibration intervals + * @pps_jitcnt: PPS monitor: jitter limit exceeded + * @pps_stbcnt: PPS monitor: stability limit exceeded + * @pps_errcnt: PPS monitor: calibration errors + * + * Protected by the timekeeping locks. 
*/ +struct ntp_data { + unsigned long tick_usec; + u64 tick_length; + u64 tick_length_base; + int time_state; + int time_status; + s64 time_offset; + long time_constant; + long time_maxerror; + long time_esterror; + s64 time_freq; + time64_t time_reftime; + long time_adjust; + s64 ntp_tick_adj; + time64_t ntp_next_leap_sec; +#ifdef CONFIG_NTP_PPS + int pps_valid; + long pps_tf[3]; + long pps_jitter; + struct timespec64 pps_fbase; + int pps_shift; + int pps_intcnt; + s64 pps_freq; + long pps_stabil; + long pps_calcnt; + long pps_jitcnt; + long pps_stbcnt; + long pps_errcnt; +#endif +}; - -/* USER_HZ period (usecs): */ -unsigned long tick_usec = USER_TICK_USEC; - -/* SHIFTED_HZ period (nsecs): */ -unsigned long tick_nsec; - -static u64 tick_length; -static u64 tick_length_base; +static struct ntp_data tk_ntp_data = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, +}; #define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ @@ -45,46 +102,6 @@ static u64 tick_length_base; (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 -/* - * phase-lock loop variables - */ - -/* - * clock synchronization status - * - * (TIME_ERROR prevents overwriting the CMOS clock) - */ -static int time_state = TIME_OK; - -/* clock status bits: */ -static int time_status = STA_UNSYNC; - -/* time adjustment (nsecs): */ -static s64 time_offset; - -/* pll time constant: */ -static long time_constant = 2; - -/* maximum error (usecs): */ -static long time_maxerror = NTP_PHASE_LIMIT; - -/* estimated error (usecs): */ -static long time_esterror = NTP_PHASE_LIMIT; - -/* frequency offset (scaled nsecs/secs): */ -static s64 time_freq; - -/* time at last adjustment (secs): */ -static time64_t time_reftime; - -static long time_adjust; - -/* constant (boot-param configurable) NTP tick 
adjustment (upscaled) */ -static s64 ntp_tick_adj; - -/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ -static time64_t ntp_next_leap_sec = TIME64_MAX; - #ifdef CONFIG_NTP_PPS /* @@ -101,128 +118,115 @@ static time64_t ntp_next_leap_sec = TIME64_MAX; intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ -static int pps_valid; /* signal watchdog counter */ -static long pps_tf[3]; /* phase median filter */ -static long pps_jitter; /* current jitter (ns) */ -static struct timespec64 pps_fbase; /* beginning of the last freq interval */ -static int pps_shift; /* current interval duration (s) (shift) */ -static int pps_intcnt; /* interval counter */ -static s64 pps_freq; /* frequency offset (scaled ns/s) */ -static long pps_stabil; /* current stability (scaled ns/s) */ - /* - * PPS signal quality monitors - */ -static long pps_calcnt; /* calibration intervals */ -static long pps_jitcnt; /* jitter limit exceeded */ -static long pps_stbcnt; /* stability limit exceeded */ -static long pps_errcnt; /* calibration errors */ - - -/* PPS kernel consumer compensates the whole phase error immediately. + * PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. 
*/ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + if (ntpdata->time_status & STA_PPSTIME && ntpdata->time_status & STA_PPSSIGNAL) return offset; else - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) { - /* the PPS calibration interval may end - surprisingly early */ - pps_shift = PPS_INTMIN; - pps_intcnt = 0; + /* The PPS calibration interval may end surprisingly early */ + ntpdata->pps_shift = PPS_INTMIN; + ntpdata->pps_intcnt = 0; } /** * pps_clear - Clears the PPS state variables + * @ntpdata: Pointer to ntp data */ -static inline void pps_clear(void) +static inline void pps_clear(struct ntp_data *ntpdata) { - pps_reset_freq_interval(); - pps_tf[0] = 0; - pps_tf[1] = 0; - pps_tf[2] = 0; - pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; - pps_freq = 0; + pps_reset_freq_interval(ntpdata); + ntpdata->pps_tf[0] = 0; + ntpdata->pps_tf[1] = 0; + ntpdata->pps_tf[2] = 0; + ntpdata->pps_fbase.tv_sec = ntpdata->pps_fbase.tv_nsec = 0; + ntpdata->pps_freq = 0; } -/* Decrease pps_valid to indicate that another second has passed since - * the last PPS signal. When it reaches 0, indicate that PPS signal is - * missing. +/* + * Decrease pps_valid to indicate that another second has passed since the + * last PPS signal. When it reaches 0, indicate that PPS signal is missing. 
*/ -static inline void pps_dec_valid(void) +static inline void pps_dec_valid(struct ntp_data *ntpdata) { - if (pps_valid > 0) - pps_valid--; - else { - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - pps_clear(); + if (ntpdata->pps_valid > 0) { + ntpdata->pps_valid--; + } else { + ntpdata->time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(ntpdata); } } -static inline void pps_set_freq(s64 freq) +static inline void pps_set_freq(struct ntp_data *ntpdata) { - pps_freq = freq; + ntpdata->pps_freq = ntpdata->time_freq; } -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) - /* PPS signal lost when either PPS time or - * PPS frequency synchronization requested + /* + * PPS signal lost when either PPS time or PPS frequency + * synchronization requested */ || ((status & (STA_PPSFREQ|STA_PPSTIME)) && !(status & STA_PPSSIGNAL)) - /* PPS jitter exceeded when - * PPS time synchronization requested */ + /* + * PPS jitter exceeded when PPS time synchronization + * requested + */ || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) - /* PPS wander exceeded or calibration error when - * PPS frequency synchronization requested + /* + * PPS wander exceeded or calibration error when PPS + * frequency synchronization requested */ || ((status & STA_PPSFREQ) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { - txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + txc->ppsfreq = shift_right((ntpdata->pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->jitter = pps_jitter; - if (!(time_status & STA_NANO)) - txc->jitter = pps_jitter / NSEC_PER_USEC; - txc->shift = pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = 
pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + txc->jitter = ntpdata->pps_jitter; + if (!(ntpdata->time_status & STA_NANO)) + txc->jitter = ntpdata->pps_jitter / NSEC_PER_USEC; + txc->shift = ntpdata->pps_shift; + txc->stabil = ntpdata->pps_stabil; + txc->jitcnt = ntpdata->pps_jitcnt; + txc->calcnt = ntpdata->pps_calcnt; + txc->errcnt = ntpdata->pps_errcnt; + txc->stbcnt = ntpdata->pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ -static inline s64 ntp_offset_chunk(s64 offset) +static inline s64 ntp_offset_chunk(struct ntp_data *ntpdata, s64 offset) { - return shift_right(offset, SHIFT_PLL + time_constant); + return shift_right(offset, SHIFT_PLL + ntpdata->time_constant); } -static inline void pps_reset_freq_interval(void) {} -static inline void pps_clear(void) {} -static inline void pps_dec_valid(void) {} -static inline void pps_set_freq(s64 freq) {} +static inline void pps_reset_freq_interval(struct ntp_data *ntpdata) {} +static inline void pps_clear(struct ntp_data *ntpdata) {} +static inline void pps_dec_valid(struct ntp_data *ntpdata) {} +static inline void pps_set_freq(struct ntp_data *ntpdata) {} -static inline int is_error_status(int status) +static inline bool is_error_status(int status) { return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct __kernel_timex *txc) +static inline void pps_fill_timex(struct ntp_data *ntpdata, struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -237,138 +241,123 @@ static inline void pps_fill_timex(struct __kernel_timex *txc) #endif /* CONFIG_NTP_PPS */ - -/** - * ntp_synced - Returns 1 if the NTP status is not UNSYNC - * - */ -static inline int ntp_synced(void) -{ - return !(time_status & STA_UNSYNC); -} - - /* - * NTP methods: + * Update tick_length and tick_length_base, based on tick_usec, ntp_tick_adj and + * time_freq: */ - -/* - * Update (tick_length, tick_length_base, tick_nsec), based - * on 
(tick_usec, ntp_tick_adj, time_freq): - */ -static void ntp_update_frequency(void) +static void ntp_update_frequency(struct ntp_data *ntpdata) { - u64 second_length; - u64 new_base; + u64 second_length, new_base, tick_usec = (u64)ntpdata->tick_usec; - second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; - second_length += ntp_tick_adj; - second_length += time_freq; + second_length += ntpdata->ntp_tick_adj; + second_length += ntpdata->time_freq; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* - * Don't wait for the next second_overflow, apply - * the change to the tick length immediately: + * Don't wait for the next second_overflow, apply the change to the + * tick length immediately: */ - tick_length += new_base - tick_length_base; - tick_length_base = new_base; + ntpdata->tick_length += new_base - ntpdata->tick_length_base; + ntpdata->tick_length_base = new_base; } -static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +static inline s64 ntp_update_offset_fll(struct ntp_data *ntpdata, s64 offset64, long secs) { - time_status &= ~STA_MODE; + ntpdata->time_status &= ~STA_MODE; if (secs < MINSEC) return 0; - if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + if (!(ntpdata->time_status & STA_FLL) && (secs <= MAXSEC)) return 0; - time_status |= STA_MODE; + ntpdata->time_status |= STA_MODE; return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } -static void ntp_update_offset(long offset) +static void ntp_update_offset(struct ntp_data *ntpdata, long offset) { - s64 freq_adj; - s64 offset64; - long secs; + s64 freq_adj, offset64; + long secs, real_secs; - if (!(time_status & STA_PLL)) + if (!(ntpdata->time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) { + if (!(ntpdata->time_status & STA_NANO)) { /* Make sure the multiplication below won't overflow */ 
offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; } - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ + /* Scale the phase adjustment and clamp to the operating range. */ offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = (long)(__ktime_get_real_seconds() - time_reftime); - if (unlikely(time_status & STA_FREQHOLD)) + real_secs = __ktime_get_real_seconds(); + secs = (long)(real_secs - ntpdata->time_reftime); + if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; - time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = real_secs; offset64 = offset; - freq_adj = ntp_update_offset_fll(offset64, secs); + freq_adj = ntp_update_offset_fll(ntpdata, offset64, secs); /* * Clamp update interval to reduce PLL gain with low * sampling rate (e.g. intermittent network connection) * to avoid instability. */ - if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) - secs = 1 << (SHIFT_PLL + 1 + time_constant); + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + ntpdata->time_constant))) + secs = 1 << (SHIFT_PLL + 1 + ntpdata->time_constant); freq_adj += (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + ntpdata->time_constant)); - freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + freq_adj = min(freq_adj + ntpdata->time_freq, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); + ntpdata->time_freq = max(freq_adj, -MAXFREQ_SCALED); - time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + ntpdata->time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } -/** - * ntp_clear - Clears the NTP state variables - */ -void ntp_clear(void) +static void __ntp_clear(struct ntp_data *ntpdata) { - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - 
time_esterror = NTP_PHASE_LIMIT; + /* Stop active adjtime() */ + ntpdata->time_adjust = 0; + ntpdata->time_status |= STA_UNSYNC; + ntpdata->time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_esterror = NTP_PHASE_LIMIT; - ntp_update_frequency(); + ntp_update_frequency(ntpdata); - tick_length = tick_length_base; - time_offset = 0; + ntpdata->tick_length = ntpdata->tick_length_base; + ntpdata->time_offset = 0; - ntp_next_leap_sec = TIME64_MAX; + ntpdata->ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ - pps_clear(); + pps_clear(ntpdata); +} + +/** + * ntp_clear - Clears the NTP state variables + */ +void ntp_clear(void) +{ + __ntp_clear(&tk_ntp_data); } u64 ntp_tick_length(void) { - return tick_length; + return tk_ntp_data.tick_length; } /** @@ -379,16 +368,17 @@ u64 ntp_tick_length(void) */ ktime_t ntp_get_next_leap(void) { + struct ntp_data *ntpdata = &tk_ntp_data; ktime_t ret; - if ((time_state == TIME_INS) && (time_status & STA_INS)) - return ktime_set(ntp_next_leap_sec, 0); + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) + return ktime_set(ntpdata->ntp_next_leap_sec, 0); ret = KTIME_MAX; return ret; } /* - * this routine handles the overflow of the microsecond field + * This routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills ([email protected]) of NTP fame. @@ -399,6 +389,7 @@ ktime_t ntp_get_next_leap(void) */ int second_overflow(time64_t secs) { + struct ntp_data *ntpdata = &tk_ntp_data; s64 delta; int leap = 0; s32 rem; @@ -408,87 +399,84 @@ int second_overflow(time64_t secs) * day, the system clock is set back one second; if in leap-delete * state, the system clock is set ahead one second. 
*/ - switch (time_state) { + switch (ntpdata->time_state) { case TIME_OK: - if (time_status & STA_INS) { - time_state = TIME_INS; + if (ntpdata->time_status & STA_INS) { + ntpdata->time_state = TIME_INS; div_s64_rem(secs, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } else if (ntpdata->time_status & STA_DEL) { + ntpdata->time_state = TIME_DEL; div_s64_rem(secs + 1, SECS_PER_DAY, &rem); - ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + ntpdata->ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: - if (!(time_status & STA_INS)) { - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + if (!(ntpdata->time_status & STA_INS)) { + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_OK; + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = -1; - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); + ntpdata->time_state = TIME_OOP; + pr_notice("Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: - if (!(time_status & STA_DEL)) { - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_OK; - } else if (secs == ntp_next_leap_sec) { + if (!(ntpdata->time_status & STA_DEL)) { + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_OK; + } else if (secs == ntpdata->ntp_next_leap_sec) { leap = 1; - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_WAIT; + pr_notice("Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: - ntp_next_leap_sec = TIME64_MAX; - time_state = TIME_WAIT; + ntpdata->ntp_next_leap_sec = TIME64_MAX; + ntpdata->time_state = TIME_WAIT; break; case TIME_WAIT: - if (!(time_status & (STA_INS | 
STA_DEL))) - time_state = TIME_OK; + if (!(ntpdata->time_status & (STA_INS | STA_DEL))) + ntpdata->time_state = TIME_OK; break; } - /* Bump the maxerror field */ - time_maxerror += MAXFREQ / NSEC_PER_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; + ntpdata->time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (ntpdata->time_maxerror > NTP_PHASE_LIMIT) { + ntpdata->time_maxerror = NTP_PHASE_LIMIT; + ntpdata->time_status |= STA_UNSYNC; } /* Compute the phase adjustment for the next second */ - tick_length = tick_length_base; + ntpdata->tick_length = ntpdata->tick_length_base; - delta = ntp_offset_chunk(time_offset); - time_offset -= delta; - tick_length += delta; + delta = ntp_offset_chunk(ntpdata, ntpdata->time_offset); + ntpdata->time_offset -= delta; + ntpdata->tick_length += delta; /* Check PPS signal */ - pps_dec_valid(); + pps_dec_valid(ntpdata); - if (!time_adjust) + if (!ntpdata->time_adjust) goto out; - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; + if (ntpdata->time_adjust > MAX_TICKADJ) { + ntpdata->time_adjust -= MAX_TICKADJ; + ntpdata->tick_length += MAX_TICKADJ_SCALED; goto out; } - if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; + if (ntpdata->time_adjust < -MAX_TICKADJ) { + ntpdata->time_adjust += MAX_TICKADJ; + ntpdata->tick_length -= MAX_TICKADJ_SCALED; goto out; } - tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) - << NTP_SCALE_SHIFT; - time_adjust = 0; + ntpdata->tick_length += (s64)(ntpdata->time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + ntpdata->time_adjust = 0; out: return leap; @@ -611,6 +599,15 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns } #endif +/** + * ntp_synced - Tells whether the NTP status is not UNSYNC + * Returns: true if not UNSYNC, false otherwise + */ +static inline bool 
ntp_synced(void) +{ + return !(tk_ntp_data.time_status & STA_UNSYNC); +} + /* * If we have an externally synchronized Linux clock, then update RTC clock * accordingly every ~11 minutes. Generally RTCs can only store second @@ -691,162 +688,156 @@ static inline void __init ntp_init_cmos_sync(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct __kernel_timex *txc) +static inline void process_adj_status(struct ntp_data *ntpdata, const struct __kernel_timex *txc) { - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - ntp_next_leap_sec = TIME64_MAX; - /* restart PPS frequency calibration */ - pps_reset_freq_interval(); + if ((ntpdata->time_status & STA_PLL) && !(txc->status & STA_PLL)) { + ntpdata->time_state = TIME_OK; + ntpdata->time_status = STA_UNSYNC; + ntpdata->ntp_next_leap_sec = TIME64_MAX; + /* Restart PPS frequency calibration */ + pps_reset_freq_interval(ntpdata); } /* * If we turn on PLL adjustments then reset the * reference time to current time. 
*/ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = __ktime_get_real_seconds(); + if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) + ntpdata->time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ - time_status &= STA_RONLY; - time_status |= txc->status & ~STA_RONLY; + ntpdata->time_status &= STA_RONLY; + ntpdata->time_status |= txc->status & ~STA_RONLY; } - -static inline void process_adjtimex_modes(const struct __kernel_timex *txc, +static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct __kernel_timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) - process_adj_status(txc); + process_adj_status(ntpdata, txc); if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; + ntpdata->time_status |= STA_NANO; if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; + ntpdata->time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - /* update pps_freq */ - pps_set_freq(time_freq); + ntpdata->time_freq = txc->freq * PPM_SCALE; + ntpdata->time_freq = min(ntpdata->time_freq, MAXFREQ_SCALED); + ntpdata->time_freq = max(ntpdata->time_freq, -MAXFREQ_SCALED); + /* Update pps_freq */ + pps_set_freq(ntpdata); } if (txc->modes & ADJ_MAXERROR) - time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); + ntpdata->time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); + ntpdata->time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = clamp(txc->constant, 0, MAXTC); - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = clamp(time_constant, 0, MAXTC); + ntpdata->time_constant = clamp(txc->constant, 0, MAXTC); + if (!(ntpdata->time_status & STA_NANO)) + ntpdata->time_constant += 4; + ntpdata->time_constant 
= clamp(ntpdata->time_constant, 0, MAXTC); } - if (txc->modes & ADJ_TAI && - txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) + if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) - ntp_update_offset(txc->offset); + ntp_update_offset(ntpdata, txc->offset); if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; + ntpdata->tick_usec = txc->tick; if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); + ntp_update_frequency(ntpdata); } - /* - * adjtimex mainly allows reading (and writing, if superuser) of + * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { + struct ntp_data *ntpdata = &tk_ntp_data; int result; if (txc->modes & ADJ_ADJTIME) { - long save_adjust = time_adjust; + long save_adjust = ntpdata->time_adjust; if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - ntp_update_frequency(); + ntpdata->time_adjust = txc->offset; + ntp_update_frequency(ntpdata); audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); - audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, ntpdata->time_adjust); } txc->offset = save_adjust; } else { /* If there are input parameters, then process them: */ if (txc->modes) { - audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); - audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + 
audit_ntp_set_old(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); - process_adjtimex_modes(txc, time_tai); + process_adjtimex_modes(ntpdata, txc, time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); - audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); - audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, ntpdata->time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, ntpdata->time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, ntpdata->time_status); audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); - audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, ntpdata->tick_usec); } - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) + txc->offset = shift_right(ntpdata->time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); + if (!(ntpdata->time_status & STA_NANO)) txc->offset = (u32)txc->offset / NSEC_PER_USEC; } - result = time_state; /* mostly `TIME_OK' */ - /* check for errors */ - if (is_error_status(time_status)) + result = ntpdata->time_state; + if (is_error_status(ntpdata->time_status)) result = TIME_ERROR; - txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + txc->freq = shift_right((ntpdata->time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; + txc->maxerror = ntpdata->time_maxerror; + txc->esterror = ntpdata->time_esterror; + txc->status = ntpdata->time_status; + txc->constant = ntpdata->time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; - txc->tick = tick_usec; + txc->tick = ntpdata->tick_usec; txc->tai = *time_tai; - /* fill PPS status fields */ - pps_fill_timex(txc); + /* Fill PPS status fields */ + pps_fill_timex(ntpdata, txc); txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; - if (!(time_status & STA_NANO)) + if 
(!(ntpdata->time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ - if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { - if ((time_state == TIME_INS) && (time_status & STA_INS)) { + if (unlikely(ts->tv_sec >= ntpdata->ntp_next_leap_sec)) { + if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) { result = TIME_OOP; txc->tai++; txc->time.tv_sec--; } - if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + if ((ntpdata->time_state == TIME_DEL) && (ntpdata->time_status & STA_DEL)) { result = TIME_WAIT; txc->tai--; txc->time.tv_sec++; } - if ((time_state == TIME_OOP) && - (ts->tv_sec == ntp_next_leap_sec)) { + if ((ntpdata->time_state == TIME_OOP) && (ts->tv_sec == ntpdata->ntp_next_leap_sec)) result = TIME_WAIT; - } } return result; @@ -854,17 +845,21 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, #ifdef CONFIG_NTP_PPS -/* actually struct pps_normtime is good old struct timespec, but it is +/* + * struct pps_normtime is basically a struct timespec, but it is * semantically different (and it is the reason why it was invented): * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] - * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) + */ struct pps_normtime { s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; -/* normalize the timestamp so that nsec is in the - ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +/* + * Normalize the timestamp so that nsec is in the + * [ -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval + */ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { @@ -880,54 +875,57 @@ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) return norm; } -/* get current phase correction and jitter */ -static inline long pps_phase_filter_get(long *jitter) +/* Get current phase correction and 
jitter */ +static inline long pps_phase_filter_get(struct ntp_data *ntpdata, long *jitter) { - *jitter = pps_tf[0] - pps_tf[1]; + *jitter = ntpdata->pps_tf[0] - ntpdata->pps_tf[1]; if (*jitter < 0) *jitter = -*jitter; /* TODO: test various filters */ - return pps_tf[0]; + return ntpdata->pps_tf[0]; } -/* add the sample to the phase filter */ -static inline void pps_phase_filter_add(long err) +/* Add the sample to the phase filter */ +static inline void pps_phase_filter_add(struct ntp_data *ntpdata, long err) { - pps_tf[2] = pps_tf[1]; - pps_tf[1] = pps_tf[0]; - pps_tf[0] = err; + ntpdata->pps_tf[2] = ntpdata->pps_tf[1]; + ntpdata->pps_tf[1] = ntpdata->pps_tf[0]; + ntpdata->pps_tf[0] = err; } -/* decrease frequency calibration interval length. - * It is halved after four consecutive unstable intervals. +/* + * Decrease frequency calibration interval length. It is halved after four + * consecutive unstable intervals. */ -static inline void pps_dec_freq_interval(void) +static inline void pps_dec_freq_interval(struct ntp_data *ntpdata) { - if (--pps_intcnt <= -PPS_INTCOUNT) { - pps_intcnt = -PPS_INTCOUNT; - if (pps_shift > PPS_INTMIN) { - pps_shift--; - pps_intcnt = 0; + if (--ntpdata->pps_intcnt <= -PPS_INTCOUNT) { + ntpdata->pps_intcnt = -PPS_INTCOUNT; + if (ntpdata->pps_shift > PPS_INTMIN) { + ntpdata->pps_shift--; + ntpdata->pps_intcnt = 0; } } } -/* increase frequency calibration interval length. - * It is doubled after four consecutive stable intervals. +/* + * Increase frequency calibration interval length. It is doubled after + * four consecutive stable intervals. 
*/ -static inline void pps_inc_freq_interval(void) +static inline void pps_inc_freq_interval(struct ntp_data *ntpdata) { - if (++pps_intcnt >= PPS_INTCOUNT) { - pps_intcnt = PPS_INTCOUNT; - if (pps_shift < PPS_INTMAX) { - pps_shift++; - pps_intcnt = 0; + if (++ntpdata->pps_intcnt >= PPS_INTCOUNT) { + ntpdata->pps_intcnt = PPS_INTCOUNT; + if (ntpdata->pps_shift < PPS_INTMAX) { + ntpdata->pps_shift++; + ntpdata->pps_intcnt = 0; } } } -/* update clock frequency based on MONOTONIC_RAW clock PPS signal +/* + * Update clock frequency based on MONOTONIC_RAW clock PPS signal * timestamps * * At the end of the calibration interval the difference between the @@ -936,90 +934,88 @@ static inline void pps_inc_freq_interval(void) * too long, the data are discarded. * Returns the difference between old and new frequency values. */ -static long hardpps_update_freq(struct pps_normtime freq_norm) +static long hardpps_update_freq(struct ntp_data *ntpdata, struct pps_normtime freq_norm) { long delta, delta_mod; s64 ftemp; - /* check if the frequency interval was too long */ - if (freq_norm.sec > (2 << pps_shift)) { - time_status |= STA_PPSERROR; - pps_errcnt++; - pps_dec_freq_interval(); - printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %lld s\n", - freq_norm.sec); + /* Check if the frequency interval was too long */ + if (freq_norm.sec > (2 << ntpdata->pps_shift)) { + ntpdata->time_status |= STA_PPSERROR; + ntpdata->pps_errcnt++; + pps_dec_freq_interval(ntpdata); + printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", + freq_norm.sec); return 0; } - /* here the raw frequency offset and wander (stability) is - * calculated. If the wander is less than the wander threshold - * the interval is increased; otherwise it is decreased. + /* + * Here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold the + * interval is increased; otherwise it is decreased. 
*/ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; + delta = shift_right(ftemp - ntpdata->pps_freq, NTP_SCALE_SHIFT); + ntpdata->pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - printk_deferred(KERN_WARNING - "hardpps: PPSWANDER: change=%ld\n", delta); - time_status |= STA_PPSWANDER; - pps_stbcnt++; - pps_dec_freq_interval(); - } else { /* good sample */ - pps_inc_freq_interval(); + printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); + ntpdata->time_status |= STA_PPSWANDER; + ntpdata->pps_stbcnt++; + pps_dec_freq_interval(ntpdata); + } else { + /* Good sample */ + pps_inc_freq_interval(ntpdata); } - /* the stability metric is calculated as the average of recent - * frequency changes, but is used only for performance - * monitoring + /* + * The stability metric is calculated as the average of recent + * frequency changes, but is used only for performance monitoring */ delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << - (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; - - /* if enabled, the system clock frequency is updated */ - if ((time_status & STA_PPSFREQ) != 0 && - (time_status & STA_FREQHOLD) == 0) { - time_freq = pps_freq; - ntp_update_frequency(); + ntpdata->pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - ntpdata->pps_stabil) >> PPS_INTMIN; + + /* If enabled, the system clock frequency is updated */ + if ((ntpdata->time_status & STA_PPSFREQ) && !(ntpdata->time_status & STA_FREQHOLD)) { + ntpdata->time_freq = ntpdata->pps_freq; + ntp_update_frequency(ntpdata); } return delta; } -/* correct REALTIME clock phase error against PPS signal */ -static void hardpps_update_phase(long error) +/* Correct REALTIME clock phase error against PPS signal */ +static void 
hardpps_update_phase(struct ntp_data *ntpdata, long error) { long correction = -error; long jitter; - /* add the sample to the median filter */ - pps_phase_filter_add(correction); - correction = pps_phase_filter_get(&jitter); + /* Add the sample to the median filter */ + pps_phase_filter_add(ntpdata, correction); + correction = pps_phase_filter_get(ntpdata, &jitter); - /* Nominal jitter is due to PPS signal noise. If it exceeds the + /* + * Nominal jitter is due to PPS signal noise. If it exceeds the * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. */ - if (jitter > (pps_jitter << PPS_POPCORN)) { - printk_deferred(KERN_WARNING - "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); - time_status |= STA_PPSJITTER; - pps_jitcnt++; - } else if (time_status & STA_PPSTIME) { - /* correct the time using the phase offset */ - time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, - NTP_INTERVAL_FREQ); - /* cancel running adjtime() */ - time_adjust = 0; + if (jitter > (ntpdata->pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (ntpdata->pps_jitter << PPS_POPCORN)); + ntpdata->time_status |= STA_PPSJITTER; + ntpdata->pps_jitcnt++; + } else if (ntpdata->time_status & STA_PPSTIME) { + /* Correct the time using the phase offset */ + ntpdata->time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* Cancel running adjtime() */ + ntpdata->time_adjust = 0; } - /* update jitter */ - pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; + /* Update jitter */ + ntpdata->pps_jitter += (jitter - ntpdata->pps_jitter) >> PPS_INTMIN; } /* @@ -1037,60 +1033,62 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; + struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); - 
/* clear the error bits, they will be set again if needed */ - time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + /* Clear the error bits, they will be set again if needed */ + ntpdata->time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); /* indicate signal presence */ - time_status |= STA_PPSSIGNAL; - pps_valid = PPS_VALID; + ntpdata->time_status |= STA_PPSSIGNAL; + ntpdata->pps_valid = PPS_VALID; - /* when called for the first time, - * just start the frequency interval */ - if (unlikely(pps_fbase.tv_sec == 0)) { - pps_fbase = *raw_ts; + /* + * When called for the first time, just start the frequency + * interval + */ + if (unlikely(ntpdata->pps_fbase.tv_sec == 0)) { + ntpdata->pps_fbase = *raw_ts; return; } - /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); - - /* check that the signal is in the range - * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ - if ((freq_norm.sec == 0) || - (freq_norm.nsec > MAXFREQ * freq_norm.sec) || - (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { - time_status |= STA_PPSJITTER; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; + /* Ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, ntpdata->pps_fbase)); + + /* + * Check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it + */ + if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + ntpdata->time_status |= STA_PPSJITTER; + /* Restart the frequency calibration interval */ + ntpdata->pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } - /* signal is ok */ - - /* check if the current frequency interval is finished */ - if (freq_norm.sec >= (1 << pps_shift)) { - pps_calcnt++; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; - 
hardpps_update_freq(freq_norm); + /* Signal is ok. Check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << ntpdata->pps_shift)) { + ntpdata->pps_calcnt++; + /* Restart the frequency calibration interval */ + ntpdata->pps_fbase = *raw_ts; + hardpps_update_freq(ntpdata, freq_norm); } - hardpps_update_phase(pts_norm.nsec); + hardpps_update_phase(ntpdata, pts_norm.nsec); } #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); if (rc) return rc; - ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 4782edcbe7b9..1af0bb2cc45c 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -168,7 +168,6 @@ static int posix_clock_release(struct inode *inode, struct file *fp) static const struct file_operations posix_clock_file_operations = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, @@ -310,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) struct posix_clock_desc cd; int err; + if (!timespec64_valid_strict(ts)) + return -EINVAL; + err = get_clock_desc(id, &cd); if (err) return err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 6bcee4704059..50e8d04ab661 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -453,7 +453,6 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; - timer->it_active = 0; if (!cpu_timer_dequeue(ctmr)) return; @@ -494,19 +493,28 @@ static int posix_cpu_timer_del(struct k_itimer *timer) */ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { - if (timer->it.cpu.firing) + if 
(timer->it.cpu.firing) { + /* + * Prevent signal delivery. The timer cannot be dequeued + * because it is on the firing list which is not protected + * by sighand->lock. The delivery path is waiting for + * the timer lock. So go back, unlock and retry. + */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; - else + } else { disarm_timer(timer, p); - + } unlock_task_sighand(p, &flags); } out: rcu_read_unlock(); - if (!ret) - put_pid(ctmr->pid); + if (!ret) { + put_pid(ctmr->pid); + timer->it_status = POSIX_TIMER_DISARMED; + } return ret; } @@ -560,7 +568,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - timer->it_active = 1; + timer->it_status = POSIX_TIMER_ARMED; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -586,29 +594,20 @@ static void cpu_timer_fire(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - timer->it_active = 0; - if (unlikely(timer->sigq == NULL)) { + timer->it_status = POSIX_TIMER_DISARMED; + + if (unlikely(ctmr->nanosleep)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); cpu_timer_setexpires(ctmr, 0); - } else if (!timer->it_interval) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ + } else { posix_timer_queue_signal(timer); - cpu_timer_setexpires(ctmr, 0); - } else if (posix_timer_queue_signal(timer)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. 
- */ - posix_cpu_timer_rearm(timer); - ++timer->it_requeue_pending; + /* Disable oneshot timers */ + if (!timer->it_interval) + cpu_timer_setexpires(ctmr, 0); } } @@ -667,11 +666,17 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, old_expires = cpu_timer_getexpires(ctmr); if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; + /* + * Prevent signal delivery. The timer cannot be dequeued + * because it is on the firing list which is not protected + * by sighand->lock. The delivery path is waiting for + * the timer lock. So go back, unlock and retry. + */ + timer->it.cpu.firing = false; ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; } /* @@ -745,7 +750,7 @@ static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *i * - Timers which expired, but the signal has not yet been * delivered */ - if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none)) + if (iv && timer->it_status != POSIX_TIMER_ARMED) expires = bump_cpu_timer(timer, now); else expires = cpu_timer_getexpires(&timer->it.cpu); @@ -808,7 +813,7 @@ static u64 collect_timerqueue(struct timerqueue_head *head, if (++i == MAX_COLLECTED || now < expires) return expires; - ctmr->firing = 1; + ctmr->firing = true; /* See posix_cpu_timer_wait_running() */ rcu_assign_pointer(ctmr->handling, current); cpu_timer_dequeue(ctmr); @@ -1363,7 +1368,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. 
*/ list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { - int cpu_firing; + bool cpu_firing; /* * spin_lock() is sufficient here even independent of the @@ -1375,13 +1380,13 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; + timer->it.cpu.firing = false; /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. + * If the firing flag is cleared then this raced with a + * timer rearm/delete operation. So don't generate an + * event. */ - if (likely(cpu_firing >= 0)) + if (likely(cpu_firing)) cpu_timer_fire(timer); /* See posix_cpu_timer_wait_running() */ rcu_assign_pointer(timer->it.cpu.handling, NULL); @@ -1478,6 +1483,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_overrun = -1; error = posix_cpu_timer_create(&timer); timer.it_process = current; + timer.it.cpu.nanosleep = true; if (!error) { static struct itimerspec64 zero_it; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4576aaed13b2..881a9ce96af7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -233,11 +233,12 @@ __initcall(init_posix_timers); * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX */ -static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +static inline int timer_overrun_to_int(struct k_itimer *timr) { - s64 sum = timr->it_overrun_last + (s64)baseval; + if (timr->it_overrun_last > (s64)INT_MAX) + return INT_MAX; - return sum > (s64)INT_MAX ? 
INT_MAX : (int)sum; + return (int)timr->it_overrun_last; } static void common_hrtimer_rearm(struct k_itimer *timr) @@ -249,62 +250,62 @@ static void common_hrtimer_rearm(struct k_itimer *timr) hrtimer_restart(timer); } +static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) +{ + guard(spinlock)(&timr->it_lock); + + /* + * Check if the timer is still alive or whether it got modified + * since the signal was queued. In either case, don't rearm and + * drop the signal. + */ + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + return false; + + if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) + return true; + + timr->kclock->timer_rearm(timr); + timr->it_status = POSIX_TIMER_ARMED; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_signal_seq; + info->si_overrun = timer_overrun_to_int(timr); + return true; +} + /* - * This function is called from the signal delivery code if - * info->si_sys_private is not zero, which indicates that the timer has to - * be rearmed. Restart the timer and update info::si_overrun. + * This function is called from the signal delivery code. It decides + * whether the signal should be dropped and rearms interval timers. The + * timer can be unconditionally accessed as there is a reference held on + * it. */ -void posixtimer_rearm(struct kernel_siginfo *info) +bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { - struct k_itimer *timr; - unsigned long flags; - - timr = lock_timer(info->si_tid, &flags); - if (!timr) - return; + struct k_itimer *timr = container_of(timer_sigq, struct k_itimer, sigq); + bool ret; - if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { - timr->kclock->timer_rearm(timr); + /* + * Release siglock to ensure proper locking order versus + * timr::it_lock. Keep interrupts disabled. 
+ */ + spin_unlock(&current->sighand->siglock); - timr->it_active = 1; - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1LL; - ++timr->it_requeue_pending; + ret = __posixtimer_deliver_signal(info, timr); - info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); - } + /* Drop the reference which was acquired when the signal was queued */ + posixtimer_putref(timr); - unlock_timer(timr, flags); + spin_lock(&current->sighand->siglock); + return ret; } -int posix_timer_queue_signal(struct k_itimer *timr) +void posix_timer_queue_signal(struct k_itimer *timr) { - int ret, si_private = 0; - enum pid_type type; - lockdep_assert_held(&timr->it_lock); - timr->it_active = 0; - if (timr->it_interval) - si_private = ++timr->it_requeue_pending; - - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->posixtimer_rearm(). - * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls posixtimer_rearm(). - * We re-queue ->sigq and drop ->it_lock(). - * posixtimer_rearm() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; - - type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; - ret = send_sigqueue(timr->sigq, timr->it_pid, type); - /* If we failed to send the signal the timer stops. */ - return ret > 0; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; + posixtimer_send_sigqueue(timr); } /* @@ -317,62 +318,10 @@ int posix_timer_queue_signal(struct k_itimer *timr) static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); - enum hrtimer_restart ret = HRTIMER_NORESTART; - unsigned long flags; - - spin_lock_irqsave(&timr->it_lock, flags); - - if (posix_timer_queue_signal(timr)) { - /* - * The signal was not queued due to SIG_IGN.
As a - * consequence the timer is not going to be rearmed from - * the signal delivery path. But as a real signal handler - * can be installed later the timer must be rearmed here. - */ - if (timr->it_interval != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change to the signal handling code. - * - * For now let timers with an interval less than a - * jiffy expire every jiffy and recheck for a - * valid signal handler. - * - * This avoids interrupt starvation in case of a - * very small interval, which would expire the - * timer immediately again. - * - * Moving now ahead of time by one jiffy tricks - * hrtimer_forward() to expire the timer later, - * while it still maintains the overrun accuracy - * for the price of a slight inconsistency in the - * timer_gettime() case. This is at least better - * than a timer storm. - * - * Only required when high resolution timers are - * enabled as the periodic tick based timers are - * automatically aligned to the next tick. 
- */ - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { - ktime_t kj = TICK_NSEC; - - if (timr->it_interval < kj) - now = ktime_add(now, kj); - } - - timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - timr->it_active = 1; - } - } - unlock_timer(timr, flags); - return ret; + guard(spinlock_irqsave)(&timr->it_lock); + posix_timer_queue_signal(timr); + return HRTIMER_NORESTART; } static struct pid *good_sigevent(sigevent_t * event) @@ -399,32 +348,27 @@ static struct pid *good_sigevent(sigevent_t * event) } } -static struct k_itimer * alloc_posix_timer(void) +static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; - if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + + if (unlikely(!posixtimer_init_sigqueue(&tmr->sigq))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } - clear_siginfo(&tmr->sigq->info); + rcuref_init(&tmr->rcuref, 1); return tmr; } -static void k_itimer_rcu_free(struct rcu_head *head) -{ - struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); - - kmem_cache_free(posix_timers_cache, tmr); -} - -static void posix_timer_free(struct k_itimer *tmr) +void posixtimer_free_timer(struct k_itimer *tmr) { put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); - call_rcu(&tmr->rcu, k_itimer_rcu_free); + if (tmr->sigq.ucounts) + dec_rlimit_put_ucounts(tmr->sigq.ucounts, UCOUNT_RLIMIT_SIGPENDING); + kfree_rcu(tmr, rcu); } static void posix_timer_unhash_and_free(struct k_itimer *tmr) @@ -432,7 +376,7 @@ static void posix_timer_unhash_and_free(struct k_itimer *tmr) spin_lock(&hash_lock); hlist_del_rcu(&tmr->t_hash); spin_unlock(&hash_lock); - posix_timer_free(tmr); + posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) @@ -467,7 +411,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, */ new_timer_id = posix_timer_add(new_timer); if 
(new_timer_id < 0) { - posix_timer_free(new_timer); + posixtimer_free_timer(new_timer); return new_timer_id; } @@ -485,18 +429,23 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } new_timer->it_sigev_notify = event->sigev_notify; - new_timer->sigq->info.si_signo = event->sigev_signo; - new_timer->sigq->info.si_value = event->sigev_value; + new_timer->sigq.info.si_signo = event->sigev_signo; + new_timer->sigq.info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->sigq->info.si_signo = SIGALRM; - memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); - new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->sigq.info.si_signo = SIGALRM; + memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; + if (new_timer->it_sigev_notify & SIGEV_THREAD_ID) + new_timer->it_pid_type = PIDTYPE_PID; + else + new_timer->it_pid_type = PIDTYPE_TGID; + + new_timer->sigq.info.si_tid = new_timer->it_id; + new_timer->sigq.info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; @@ -580,7 +529,14 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * 1) Set timr::it_signal to NULL with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock - * 4) Call RCU for removal after the grace period + * 4) Put the reference count. + * + * The reference count might not drop to zero if timr::sigq is + * queued. In that case the signal delivery or flush will put the + * last reference count. + * + * When the reference count reaches zero, the timer is scheduled + * for RCU removal after the grace period. 
* * Holding rcu_read_lock() accross the lookup ensures that * the timer cannot be freed. @@ -647,10 +603,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - } else if (!timr->it_active) { + } else if (timr->it_status == POSIX_TIMER_DISARMED) { /* * SIGEV_NONE oneshot timers are never queued and therefore - * timr->it_active is always false. The check below + * timr->it_status is always DISARMED. The check below * vs. remaining time will handle this case. * * For all other timers there is nothing to update here, so @@ -667,7 +623,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + if (iv && timr->it_status != POSIX_TIMER_ARMED) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); @@ -775,7 +731,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) if (!timr) return -EINVAL; - overrun = timer_overrun_to_int(timr, 0); + overrun = timer_overrun_to_int(timr); unlock_timer(timr, flags); return overrun; @@ -867,8 +823,6 @@ void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_set else timer->it_interval = 0; - /* Prevent reloading in case there is a signal pending */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING; /* Reset overrun accounting */ timer->it_overrun_last = 0; timer->it_overrun = -1LL; @@ -886,8 +840,6 @@ int common_timer_set(struct k_itimer *timr, int flags, if (old_setting) common_timer_get(timr, old_setting); - /* Prevent rearming by clearing the interval */ - timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. 
@@ -895,7 +847,7 @@ int common_timer_set(struct k_itimer *timr, int flags, if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; - timr->it_active = 0; + timr->it_status = POSIX_TIMER_DISARMED; posix_timer_set_common(timr, new_setting); /* Keep timer disarmed when it_value is zero */ @@ -908,7 +860,8 @@ int common_timer_set(struct k_itimer *timr, int flags, sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - timr->it_active = !sigev_none; + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; return 0; } @@ -936,6 +889,9 @@ retry: if (old_spec64) old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -1004,17 +960,31 @@ int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; - timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timer->it_active = 0; + timer->it_status = POSIX_TIMER_DISARMED; return 0; } +/* + * If the deleted timer is on the ignored list, remove it and + * drop the associated reference. + */ +static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) +{ + if (!hlist_unhashed(&tmr->ignored_list)) { + hlist_del_init(&tmr->ignored_list); + posixtimer_putref(tmr); + } +} + static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; + /* Prevent signal delivery and rearming. */ + timer->it_signal_seq++; + if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); @@ -1040,12 +1010,18 @@ retry_delete: spin_lock(&current->sighand->siglock); hlist_del(&timer->list); - spin_unlock(&current->sighand->siglock); + posix_timer_cleanup_ignored(timer); /* * A concurrent lookup could check timer::it_signal lockless. It * will reevaluate with timer::it_lock held and observe the NULL.
+ * + * It must be written with siglock held so that the signal code + * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), + * which prevents it from moving a pending signal of a deleted + * timer to the ignore list. */ WRITE_ONCE(timer->it_signal, NULL); + spin_unlock(&current->sighand->siglock); unlock_timer(timer, flags); posix_timer_unhash_and_free(timer); @@ -1091,6 +1067,8 @@ retry_delete: } hlist_del(&timer->list); + posix_timer_cleanup_ignored(timer); + /* * Setting timer::it_signal to NULL is technically not required * here as nothing can access the timer anymore legitimately via @@ -1123,6 +1101,19 @@ void exit_itimers(struct task_struct *tsk) /* The timers are not longer accessible via tsk::signal */ while (!hlist_empty(&timers)) itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + + /* + * There should be no timers on the ignored list. itimer_delete() has + * mopped them up. + */ + if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) + return; + + hlist_move_list(&tsk->signal->ignored_posix_timers, &timers); + while (!hlist_empty(&timers)) { + posix_timer_cleanup_ignored(hlist_entry(timers.first, struct k_itimer, + ignored_list)); + } } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 4784ea65f685..61906f0688c1 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,6 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define TIMER_RETRY 1 +enum posix_timer_state { + POSIX_TIMER_DISARMED, + POSIX_TIMER_ARMED, + POSIX_TIMER_REQUEUE_PENDING, +}; + struct k_clock { int (*clock_getres)(const clockid_t which_clock, struct timespec64 *tp); @@ -36,7 +42,7 @@ extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; -int posix_timer_queue_signal(struct k_itimer *timr); +void posix_timer_queue_signal(struct k_itimer *timr); void common_timer_get(struct
k_itimer *timr, struct itimerspec64 *cur_setting); int common_timer_set(struct k_itimer *timr, int flags, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 68d6c1190ac7..fcca4e72f1ef 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -71,16 +71,16 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount_latch(&cd.seq); + *seq = read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } notrace int sched_clock_read_retry(unsigned int seq) { - return raw_read_seqcount_latch_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long noinstr sched_clock_noinstr(void) +static __always_inline unsigned long long __sched_clock(void) { struct clock_read_data *rd; unsigned int seq; @@ -98,11 +98,23 @@ unsigned long long noinstr sched_clock_noinstr(void) return res; } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return __sched_clock(); +} + unsigned long long notrace sched_clock(void) { unsigned long long ns; preempt_disable_notrace(); - ns = sched_clock_noinstr(); + /* + * All of __sched_clock() is a seqcount_latch reader critical section, + * but relies on the raw helpers which are uninstrumented. For KCSAN, + * mark all accesses in __sched_clock() as atomic. 
+ */ + kcsan_nestable_atomic_begin(); + ns = __sched_clock(); + kcsan_nestable_atomic_end(); preempt_enable_notrace(); return ns; } @@ -119,17 +131,19 @@ unsigned long long notrace sched_clock(void) */ static void update_clock_read_data(struct clock_read_data *rd) { - /* update the backup (odd) copy with the new data */ - cd.read_data[1] = *rd; - /* steer readers towards the odd copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch_begin(&cd.seq); /* now its safe for us to update the normal (even) copy */ cd.read_data[0] = *rd; /* switch readers back to the even copy */ - raw_write_seqcount_latch(&cd.seq); + write_seqcount_latch(&cd.seq); + + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + write_seqcount_latch_end(&cd.seq); } /* @@ -267,7 +281,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount_latch(&cd.seq); + unsigned int seq = read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c new file mode 100644 index 000000000000..dfe939f6e4ec --- /dev/null +++ b/kernel/time/sleep_timeout.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel internal schedule timeout and sleeping functions + */ + +#include <linux/delay.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> + +#include "tick-internal.h" + +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. 
+ */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): + * + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be %TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * Returns: 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. 
+ */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + pr_err("%s: wrong timeout value %lx\n", __func__, timeout); + dump_stack(); + __set_current_state(TASK_RUNNING); + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + timer.timer.expires = expire; + add_timer(&timer.timer); + schedule(); + del_timer_sync(&timer.timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * __set_current_state() can be used in schedule_timeout_*() functions, because + * schedule_timeout() calls schedule() unconditionally. + */ + +/** + * schedule_timeout_interruptible - sleep until timeout (interruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +/** + * schedule_timeout_killable - sleep until timeout (killable) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_KILLABLE before starting the timeout. 
+ */ +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +/** + * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. + */ +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/** + * schedule_timeout_idle - sleep until timeout (idle) + * @timeout: timeout value in jiffies + * + * See schedule_timeout() for details. + * + * Task state is set to TASK_IDLE before starting the timeout. It is similar to + * schedule_timeout_uninterruptible(), except this task will not contribute to + * load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * @clock_id: timer clock to be used + * + * Details are explained in schedule_hrtimeout_range() function description as + * this function is commonly used. + */ +int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. 
+ */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + hrtimer_sleeper_start_expires(&t, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns: 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. 
+ */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * See schedule_hrtimeout_range() for details. @delta argument of + * schedule_hrtimeout_range() is set to 0 and has therefore no impact. + */ +int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Requested sleep duration in milliseconds + * + * msleep() uses jiffy based timeouts for the sleep duration. Because of the + * design of the timer wheel, the maximum additional percentage delay (slack) is + * 12.5%. This is only valid for timers which will end up in level 1 or a higher + * level of the timer wheel. For explanation of those 12.5% please check the + * detailed description about the basics of the timer wheel. + * + * The slack of timers which will end up in level 0 depends on sleep duration + * (msecs) and HZ configuration and can be calculated in the following way (with + * the timer wheel design restriction that the slack is not less than 12.5%): + * + * ``slack = MSECS_PER_TICK / msecs`` + * + * When the allowed slack of the callsite is known, the calculation could be + * turned around to find the minimal allowed sleep duration to meet the + * constraints. For example: + * + * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: + * all sleep durations greater or equal 4ms will meet the constraints. + * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: + * all sleep durations greater or equal 8ms will meet the constraints. 
+ * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: + * all sleep durations greater or equal 16ms will meet the constraints. + * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: + * all sleep durations greater or equal 32ms will meet the constraints. + * + * See also the signal aware variant msleep_interruptible(). + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Requested sleep duration in milliseconds + * + * See msleep() for some basic information. + * + * The difference between msleep() and msleep_interruptible() is that the sleep + * could be interrupted by a signal delivery and then returns early. + * + * Returns: The remaining time of the sleep duration transformed to msecs (see + * schedule_timeout() for details). + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs); + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping + * + * usleep_range_state() sleeps at least for the minimum specified time but not + * longer than the maximum specified amount of time. The range might reduce + * power usage by allowing hrtimers to coalesce an already scheduled interrupt + * with this hrtimer. In the worst case, an interrupt is scheduled for the upper + * bound. + * + * The sleeping task is set to the specified state before starting the sleep. 
+ * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() or its variants instead of udelay(). The sleep improves + * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). + */ +void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + if (WARN_ON_ONCE(max < min)) + delta = 0; + + for (;;) { + __set_current_state(state); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 5f2105e637bd..faac36de35b9 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -25,6 +25,7 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_offline_cpu(unsigned int cpu); extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); @@ -142,10 +143,8 @@ static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_ #endif /* !(BROADCAST && ONESHOT) */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) -extern void tick_offline_cpu(unsigned int cpu); extern void tick_broadcast_offline(unsigned int cpu); #else -static inline void tick_offline_cpu(unsigned int cpu) { } static inline void tick_broadcast_offline(unsigned int cpu) { } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c7090..fa058510af9c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -311,14 +311,6 @@ static enum hrtimer_restart tick_nohz_handler(struct 
hrtimer *timer) return HRTIMER_RESTART; } -static void tick_sched_timer_cancel(struct tick_sched *ts) -{ - if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) - hrtimer_cancel(&ts->sched_timer); - else if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - tick_program_event(KTIME_MAX, 1); -} - #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); @@ -434,6 +426,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? + * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; @@ -859,7 +857,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); + return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* @@ -1055,7 +1053,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { - tick_sched_timer_cancel(ts); + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -1604,21 +1605,13 @@ void tick_setup_sched_timer(bool hrtimer) */ void tick_sched_timer_dying(int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - struct clock_event_device *dev = td->evtdev; ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ - tick_sched_timer_cancel(ts); - - /* - * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED, - * make sure not to call low-res tick handler. 
- */ - if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) - dev->event_handler = clockevents_handle_noop; + if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) + hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; diff --git a/kernel/time/time.c b/kernel/time/time.c index 642647f5046b..1b69caa87480 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -556,9 +556,9 @@ EXPORT_SYMBOL(ns_to_timespec64); * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. - * for the details see __msecs_to_jiffies() + * for the details see _msecs_to_jiffies() * - * __msecs_to_jiffies() checks for the passed in value being a constant + * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at @@ -866,7 +866,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, * * Handles compat or 32-bit modes. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) @@ -897,7 +897,7 @@ EXPORT_SYMBOL_GPL(get_timespec64); * @ts: input &struct timespec64 * @uts: user's &struct __kernel_timespec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) @@ -944,7 +944,7 @@ static int __put_old_timespec32(const struct timespec64 *ts64, * * Handles X86_X32_ABI compatibility conversion. 
* - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { @@ -963,7 +963,7 @@ EXPORT_SYMBOL_GPL(get_old_timespec32); * * Handles X86_X32_ABI compatibility conversion. * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { @@ -979,7 +979,7 @@ EXPORT_SYMBOL_GPL(put_old_timespec32); * @it: destination &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -1002,7 +1002,7 @@ EXPORT_SYMBOL_GPL(get_itimerspec64); * @it: input &struct itimerspec64 * @uit: user's &struct __kernel_itimerspec * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) @@ -1024,7 +1024,7 @@ EXPORT_SYMBOL_GPL(put_itimerspec64); * @its: destination &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) @@ -1043,7 +1043,7 @@ EXPORT_SYMBOL_GPL(get_old_itimerspec32); * @its: input &struct itimerspec64 * @uits: user's &struct old_itimerspec32 * - * Return: %0 on success or negative errno on error + * Return: 0 on success or negative errno on error */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 __user *uits) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7e6f409bf311..0ca85ff4fbb4 100644 --- a/kernel/time/timekeeping.c +++ 
b/kernel/time/timekeeping.c @@ -30,8 +30,9 @@ #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) -#define TK_MIRROR (1 << 1) -#define TK_CLOCK_WAS_SET (1 << 2) +#define TK_CLOCK_WAS_SET (1 << 1) + +#define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ @@ -41,20 +42,18 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; -DEFINE_RAW_SPINLOCK(timekeeper_lock); - /* * The most important data for readout fits into a single 64 byte * cache line. */ -static struct { +struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), -}; + struct timekeeper shadow_timekeeper; + raw_spinlock_t lock; +} ____cacheline_aligned; -static struct timekeeper shadow_timekeeper; +static struct tk_data tk_core; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -114,6 +113,36 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +unsigned long timekeeper_lock_irqsave(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&tk_core.lock, flags); + return flags; +} + +void timekeeper_unlock_irqrestore(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&tk_core.lock, flags); +} + +/* + * Multigrain timestamps require tracking the latest fine-grained timestamp + * that has been issued, and never returning a coarse-grained timestamp that is + * earlier than that value. + * + * mg_floor represents the latest fine-grained time that has been handed out as + * a file timestamp on the system. This is tracked as a monotonic ktime_t, and + * converted to a realtime clock value on an as-needed basis. + * + * Maintaining mg_floor ensures the multigrain interfaces never issue a + * timestamp earlier than one that has been previously issued. + * + * The exception to this rule is when there is a backward realtime clock jump. 
If + * such an event occurs, a timestamp can appear to be earlier than a previous one. + */ +static __cacheline_aligned_in_smp atomic64_t mg_floor; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -161,13 +190,15 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec64_to_ktime(tmp); - tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); + WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Paired with READ_ONCE() in ktime_mono_to_any() */ + WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. @@ -184,7 +215,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. - * This isn't necessary to use when holding the timekeeper_lock or doing + * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). 
*/ @@ -195,97 +226,6 @@ static inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } -#ifdef CONFIG_DEBUG_TIMEKEEPING -#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ - -static void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ - - u64 max_cycles = tk->tkr_mono.clock->max_cycles; - const char *name = tk->tkr_mono.clock->name; - - if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", - offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); - } else { - if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", - offset, name, max_cycles >> 1); - printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); - } - } - - if (tk->underflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->underflow_seen = 0; - } - - if (tk->overflow_seen) { - if (jiffies - tk->last_warning > WARNING_FREQ) { - printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); - printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); - printk_deferred(" Your kernel is probably still fine.\n"); - tk->last_warning = jiffies; - } - tk->overflow_seen = 0; - } -} - -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles); - -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - struct 
timekeeper *tk = &tk_core.timekeeper; - u64 now, last, mask, max, delta; - unsigned int seq; - - /* - * Since we're called holding a seqcount, the data may shift - * under us while we're doing the calculation. This can cause - * false positives, since we'd note a problem but throw the - * results away. So nest another seqcount here to atomically - * grab the points we are checking with. - */ - do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(tkr); - last = tkr->cycle_last; - mask = tkr->mask; - max = tkr->clock->max_cycles; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - delta = clocksource_delta(now, last, mask); - - /* - * Try to catch underflows by checking if we are seeing small - * mask-relative negative values. - */ - if (unlikely((~delta & mask) < (mask >> 3))) - tk->underflow_seen = 1; - - /* Check for multiplication overflows */ - if (unlikely(delta > max)) - tk->overflow_seen = 1; - - /* timekeeping_cycles_to_ns() handles both under and overflow */ - return timekeeping_cycles_to_ns(tkr, now); -} -#else -static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) -{ -} -static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr) -{ - BUG(); -} -#endif - /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -390,19 +330,11 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } -static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr) +static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } -static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) -{ - if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING)) - return timekeeping_debug_get_ns(tkr); - - return __timekeeping_get_ns(tkr); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 
* @tkr: Timekeeping readout base from which we take the update @@ -411,7 +343,7 @@ static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * - * Employ the latch technique; see @raw_write_seqcount_latch. + * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a @@ -424,16 +356,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tkf->seq); + write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); + + write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) @@ -443,11 +377,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) u64 now; do { - seq = raw_read_seqcount_latch(&tkf->seq); + seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += __timekeeping_get_ns(tkr); - } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); + now += timekeeping_get_ns(tkr); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -517,7 +451,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); + * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. 
Since the tk->offs_boot update is a rare event, this @@ -562,7 +496,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - delta = __timekeeping_get_ns(tkr); + delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -676,13 +610,11 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; int ret; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -695,14 +627,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - - return ret; + guard(raw_spinlock_irqsave)(&tk_core.lock); + return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); @@ -718,6 +644,18 @@ static inline void tk_update_leap_state(struct timekeeper *tk) } /* + * Leap state update for both shadow and the real timekeeper + * Separate to spare a full memcpy() of the timekeeper. 
+ */ +static void tk_update_leap_state_all(struct tk_data *tkd) +{ + write_seqcount_begin(&tkd->seq); + tk_update_leap_state(&tkd->shadow_timekeeper); + tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; + write_seqcount_end(&tkd->seq); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -750,9 +688,30 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } -/* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, unsigned int action) +/* + * Restore the shadow timekeeper from the real timekeeper. + */ +static void timekeeping_restore_shadow(struct tk_data *tkd) { + lockdep_assert_held(&tkd->lock); + memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); +} + +static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) +{ + struct timekeeper *tk = &tk_core.shadow_timekeeper; + + lockdep_assert_held(&tkd->lock); + + /* + * Block out readers before running the updates below because that + * updates VDSO and other time related infrastructure. Not blocking + * the readers might let a reader see time going backwards when + * reading from the VDSO after the VDSO update and then reading in + * the kernel from the timekeeper before that got updated. + */ + write_seqcount_begin(&tkd->seq); + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); @@ -770,14 +729,17 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; + /* - * The mirroring of the data to the shadow-timekeeper needs - * to happen last here to ensure we don't over-write the - * timekeeper structure on the next update with stale data + * Update the real timekeeper. 
+ * + * We could avoid this memcpy() by switching pointers, but that has + * the downside that the reader side does not longer benefit from + * the cacheline optimized data layout of the timekeeper and requires + * another indirection. */ - if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); + memcpy(&tkd->timekeeper, tk, sizeof(*tk)); + write_seqcount_end(&tkd->seq); } /** @@ -930,6 +892,14 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) unsigned int seq; ktime_t tconv; + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and + * tk_update_sleep_time(). + */ + return ktime_add(tmono, READ_ONCE(*offset)); + } + do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); @@ -1060,6 +1030,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned int seq; ktime_t base_raw; ktime_t base_real; + ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; @@ -1074,6 +1045,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); + base_boot = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); @@ -1081,6 +1054,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); @@ -1440,45 +1414,35 @@ EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); */ int do_settimeofday64(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; struct 
timespec64 ts_delta, xt; - unsigned long flags; - int ret = 0; if (!timespec64_valid_settod(ts)) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - timekeeping_forward_now(tk); + timekeeping_forward_now(tks); - xt = tk_xtime(tk); - ts_delta = timespec64_sub(*ts, xt); + xt = tk_xtime(tks); + ts_delta = timespec64_sub(*ts, xt); - if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { - ret = -EINVAL; - goto out; - } - - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - - tk_set_xtime(tk, ts); -out: - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); + tk_set_xtime(tks, ts); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) { - audit_tk_injoffset(ts_delta); - add_device_randomness(ts, sizeof(*ts)); - } - - return ret; + audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + return 0; } EXPORT_SYMBOL(do_settimeofday64); @@ -1490,40 +1454,31 @@ EXPORT_SYMBOL(do_settimeofday64); */ static int timekeeping_inject_offset(const struct timespec64 *ts) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 tmp; - int ret = 0; - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); - - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), *ts); - if 
(timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - ret = -EINVAL; - goto error; - } + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 tmp; - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + timekeeping_forward_now(tks); -error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; + } - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - - return ret; + return 0; } /* @@ -1576,43 +1531,34 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *new, *old = NULL; - unsigned long flags; - bool change = false; - - new = (struct clocksource *) data; + struct clocksource *new = data, *old = NULL; /* - * If the cs is in module, get a module reference. Succeeds - * for built-in code (owner == NULL) as well. + * If the clocksource is in a module, get a module reference. + * Succeeds for built-in code (owner == NULL) as well. Abort if the + * reference can't be acquired. 
*/ - if (try_module_get(new->owner)) { - if (!new->enable || new->enable(new) == 0) - change = true; - else - module_put(new->owner); - } - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - timekeeping_forward_now(tk); + if (!try_module_get(new->owner)) + return 0; - if (change) { - old = tk->tkr_mono.clock; - tk_setup_internals(tk, new); + /* Abort if the device can't be enabled */ + if (new->enable && new->enable(new) != 0) { + module_put(new->owner); + return 0; } - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_forward_now(tks); + old = tks->tkr_mono.clock; + tk_setup_internals(tks, new); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } if (old) { if (old->disable) old->disable(old); - module_put(old->owner); } @@ -1737,6 +1683,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } +static __init void tkd_basic_setup(struct tk_data *tkd) +{ + raw_spin_lock_init(&tkd->lock); + seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); +} + /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. 
* @@ -1761,9 +1713,10 @@ static bool persistent_clock_exists; void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; - struct timekeeper *tk = &tk_core.timekeeper; + struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - unsigned long flags; + + tkd_basic_setup(&tk_core); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1783,24 +1736,21 @@ void __init timekeeping_init(void) */ wall_to_mono = timespec64_sub(boot_offset, wall_time); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + guard(raw_spinlock_irqsave)(&tk_core.lock); + ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(tk, clock); + tk_setup_internals(tks, clock); - tk_set_xtime(tk, &wall_time); - tk->raw_sec = 0; + tk_set_xtime(tks, &wall_time); + tks->raw_sec = 0; - tk_set_wall_to_mono(tk, wall_to_mono); + tk_set_wall_to_mono(tks, wall_to_mono); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ @@ -1878,22 +1828,14 @@ bool timekeeping_rtc_skipsuspend(void) */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - - suspend_timing_needed = false; + scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; - timekeeping_forward_now(tk); - - __timekeeping_inject_sleeptime(tk, delta); - - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, 
flags); + suspend_timing_needed = false; + timekeeping_forward_now(tks); + __timekeeping_inject_sleeptime(tks, delta); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); @@ -1905,20 +1847,19 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) */ void timekeeping_resume(void) { - struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr_mono.clock; - unsigned long flags; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; - u64 cycle_now, nsec; bool inject_sleeptime = false; + u64 cycle_now, nsec; + unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -1932,7 +1873,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk_clock_read(&tk->tkr_mono); + cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); @@ -1944,18 +1885,17 @@ void timekeeping_resume(void) if (inject_sleeptime) { suspend_timing_needed = false; - __timekeeping_inject_sleeptime(tk, &ts_delta); + __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ - tk->tkr_mono.cycle_last = cycle_now; - tk->tkr_raw.cycle_last = cycle_now; + tks->tkr_mono.cycle_last = cycle_now; + tks->tkr_raw.cycle_last = cycle_now; - tk->ntp_error = 0; + tks->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); @@ -1967,11 +1907,11 @@ void timekeeping_resume(void) int timekeeping_suspend(void) { - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - struct timespec64 delta, delta_delta; - static struct timespec64 old_delta; + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; struct clocksource *curr_clock; + unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); @@ -1986,9 +1926,8 @@ int timekeeping_suspend(void) suspend_timing_needed = true; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - timekeeping_forward_now(tk); + raw_spin_lock_irqsave(&tk_core.lock, flags); + timekeeping_forward_now(tks); timekeeping_suspended = 1; /* @@ -1996,8 +1935,8 @@ int timekeeping_suspend(void) * just read from the current clocksource. Save this to potentially * use in suspend timing. 
*/ - curr_clock = tk->tkr_mono.clock; - cycle_now = tk->tkr_mono.cycle_last; + curr_clock = tks->tkr_mono.clock; + cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { @@ -2007,7 +1946,7 @@ int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -2022,10 +1961,9 @@ int timekeeping_suspend(void) } } - timekeeping_update(tk, TK_MIRROR); - halt_fast_timekeeper(tk); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, 0); + halt_fast_timekeeper(tks); + raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); @@ -2130,16 +2068,17 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { + u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. 
*/ - if (likely(tk->ntp_tick == ntp_tick_length())) { + if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { - tk->ntp_tick = ntp_tick_length(); + tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } @@ -2278,28 +2217,24 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, */ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { + struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; - struct timekeeper *tk = &shadow_timekeeper; - u64 offset; - int shift = 0, maxshift; unsigned int clock_set = 0; - unsigned long flags; + int shift = 0, maxshift; + u64 offset; - raw_spin_lock_irqsave(&timekeeper_lock, flags); + guard(raw_spinlock_irqsave)(&tk_core.lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - goto out; + return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) - goto out; - - /* Do some additional sanity checking */ - timekeeping_check_update(tk, offset); + return false; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -2315,8 +2250,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, - &clock_set); + offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } @@ -2330,23 +2264,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&tk_core.seq); - /* - * Update the real timekeeper. 
- * - * We could avoid this memcpy by switching pointers, but that - * requires changes to all other timekeeper usage sites as - * well, i.e. move the timekeeper pointer getter into the - * spinlocked/seqcount protected sections. And we trade this - * memcpy under the tk_core.seq against one before we start - * updating. - */ - timekeeping_update(tk, clock_set); - memcpy(real_tk, tk, sizeof(*tk)); - /* The memcpy must come last. Do not put anything here! */ - write_seqcount_end(&tk_core.seq); -out: - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; } @@ -2394,6 +2312,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); +/** + * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor + * @ts: timespec64 to be filled + * + * Fetch the global mg_floor value, convert it to realtime and compare it + * to the current coarse-grained time. Fill @ts with whichever is + * latest. Note that this is a filesystem-specific interface and should be + * avoided outside of that context. + */ +void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 floor = atomic64_read(&mg_floor); + ktime_t f_real, offset, coarse; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + *ts = tk_xtime(tk); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + coarse = timespec64_to_ktime(*ts); + f_real = ktime_add(floor, offset); + if (ktime_after(f_real, coarse)) + *ts = ktime_to_timespec64(f_real); +} + +/** + * ktime_get_real_ts64_mg - attempt to update floor value and return result + * @ts: pointer to the timespec to be set + * + * Get a monotonic fine-grained time value and attempt to swap it into + * mg_floor. If that succeeds then accept the new floor value. 
If it fails + * then another task raced in during the interim time and updated the + * floor. Since any update to the floor must be later than the previous + * floor, either outcome is acceptable. + * + * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), + * and determining that the resulting coarse-grained timestamp did not effect + * a change in ctime. Any more recent floor value would effect a change to + * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. + * + * @ts will be filled with the latest floor value, regardless of the outcome of + * the cmpxchg. Note that this is a filesystem specific interface and should be + * avoided outside of that context. + */ +void ktime_get_real_ts64_mg(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t old = atomic64_read(&mg_floor); + ktime_t offset, mono; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + mono = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + offset = tk_core.timekeeper.offs_real; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + mono = ktime_add_ns(mono, nsecs); + + /* + * Attempt to update the floor with the new time value. As any + * update must be later then the existing floor, and would effect + * a change to ctime from the perspective of the current task, + * accept the resulting floor value regardless of the outcome of + * the swap. + */ + if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + timekeeping_inc_mg_floor_swaps(); + } else { + /* + * Another task changed mg_floor since "old" was fetched. + * "old" has been updated with the latest value of "mg_floor". + * That value is newer than the previous floor value, which + * is enough to effect a change to ctime. Accept it. 
+ */ + *ts = ktime_to_timespec64(ktime_add(old, offset)); + } +} + void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; @@ -2551,13 +2557,10 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { - struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; struct timespec64 ts; - unsigned long flags; - s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -2568,6 +2571,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; + delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) @@ -2585,21 +2589,21 @@ int do_adjtimex(struct __kernel_timex *txc) ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + s32 orig_tai, tai; - orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + orig_tai = tai = tks->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai, &ad); - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tk, tai); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - clock_set = true; + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } } - tk_update_leap_state(tk); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); audit_ntp_log(&ad); @@ -2623,15 +2627,8 @@ int do_adjtimex(struct __kernel_timex *txc) */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - 
write_seqcount_begin(&tk_core.seq); - + guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); - - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index b73e8850e58d..badeb222eab9 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -17,6 +17,9 @@ #define NUM_BINS 32 +/* Incremented every time mg_floor is updated */ +DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_sleep_time_show(struct seq_file *s, void *data) @@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t) (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } +unsigned long timekeeping_get_mg_floor_swaps(void) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) + sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu)); + + return sum; +} diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ca2787d1642..63e600e943a7 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -10,12 +10,26 @@ * timekeeping debug functions */ #ifdef CONFIG_DEBUG_FS + +DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps); + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ + this_cpu_inc(timekeeping_mg_floor_swaps); +} + extern void tk_debug_account_sleep_time(const struct timespec64 *t); + #else + #define tk_debug_account_sleep_time(x) + +static inline void timekeeping_inc_mg_floor_swaps(void) +{ +} + #endif -#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) { u64 ret = (now - last) & mask; @@ -26,14 +40,9 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) */ return ret & ~(mask >> 1) ? 
0 : ret; } -#else -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) -{ - return (now - last) & mask; -} -#endif /* Semi public for serialization of non timekeeper VDSO updates. */ -extern raw_spinlock_t timekeeper_lock; +unsigned long timekeeper_lock_irqsave(void); +void timekeeper_unlock_irqrestore(unsigned long flags); #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0fc9d066a7be..a5860bf6d16f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -37,7 +37,6 @@ #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> -#include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> @@ -2422,7 +2421,8 @@ static inline void __run_timers(struct timer_base *base) static void __run_timer_base(struct timer_base *base) { - if (time_before(jiffies, base->next_expiry)) + /* Can race against a remote CPU updating next_expiry under the lock */ + if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); @@ -2499,7 +2499,7 @@ static void run_local_timers(void) */ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { - raise_softirq(TIMER_SOFTIRQ); + raise_timer_softirq(TIMER_SOFTIRQ); return; } } @@ -2526,141 +2526,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(); } -/* - * Since schedule_timeout()'s timer is defined on the stack, it must store - * the target task on the stack as well. - */ -struct process_timer { - struct timer_list timer; - struct task_struct *task; -}; - -static void process_timeout(struct timer_list *t) -{ - struct process_timer *timeout = from_timer(timeout, t, timer); - - wake_up_process(timeout->task); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have elapsed. 
- * The function behavior depends on the current task state - * (see also set_current_state() description): - * - * %TASK_RUNNING - the scheduler is called, but the task does not sleep - * at all. That happens because sched_submit_work() does nothing for - * tasks in %TASK_RUNNING state. - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process()). - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task or the current task is explicitly woken - * up. - * - * The current task state is guaranteed to be %TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed - * to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct process_timer timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. 
- */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - __set_current_state(TASK_RUNNING); - goto out; - } - } - - expire = timeout + jiffies; - - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); - schedule(); - del_timer_sync(&timer.timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* - * Like schedule_timeout_uninterruptible(), except this task will not contribute - * to load average. 
- */ -signed long __sched schedule_timeout_idle(signed long timeout) -{ - __set_current_state(TASK_IDLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_idle); - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { @@ -2757,59 +2622,3 @@ void __init init_timers(void) posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs); - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -/** - * usleep_range_state - Sleep for an approximate time in a given state - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - * @state: State of the current task that will be while sleeping - * - * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. 
- */ -void __sched usleep_range_state(unsigned long min, unsigned long max, - unsigned int state) -{ - ktime_t exp = ktime_add_us(ktime_get(), min); - u64 delta = (u64)(max - min) * NSEC_PER_USEC; - - for (;;) { - __set_current_state(state); - /* Do not return before the requested sleep time has elapsed */ - if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) - break; - } -} -EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9193d6133e5d..05d383143165 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -119,7 +119,7 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); - __arch_update_vsyscall(vdata, tk); + __arch_update_vsyscall(vdata); vdso_write_end(vdata); @@ -151,9 +151,8 @@ void update_vsyscall_tz(void) unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - unsigned long flags; + unsigned long flags = timekeeper_lock_irqsave(); - raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } @@ -172,5 +171,5 @@ void vdso_update_end(unsigned long flags) vdso_write_end(vdata); __arch_sync_vdso_data(vdata); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..f86c78961708 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1202,7 +1202,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; @@ -1219,7 +1219,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - 
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; @@ -2216,8 +2216,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); - if (ret == -ENOENT) - goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { @@ -3133,7 +3131,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; - long left; + char *p, *buf; + long left = 0; if (!upath ^ !upath_size) return -EINVAL; @@ -3147,26 +3146,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; - if (upath) { - char *p, *buf; - - upath_size = min_t(u32, upath_size, PATH_MAX); - - buf = kmalloc(upath_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - p = d_path(&umulti_link->path, buf, upath_size); - if (IS_ERR(p)) { - kfree(buf); - return PTR_ERR(p); - } - upath_size = buf + upath_size - p; - left = copy_to_user(upath, p, upath_size); + upath_size = upath_size ? 
min_t(u32, upath_size, PATH_MAX) : PATH_MAX; + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { kfree(buf); - if (left) - return -EFAULT; - info->uprobe_multi.path_size = upath_size; + return PTR_ERR(p); } + upath_size = buf + upath_size - p; + + if (upath) + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; @@ -3244,7 +3240,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) } static int -uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; @@ -3253,7 +3250,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) } static int -uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d7d4fb403f6f..69e226a48daa 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1160,19 +1160,14 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret, cpu; + int ret; - ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); @@ -1242,13 +1237,33 @@ 
static void ftrace_graph_disable_direct(bool disable_branch) fgraph_direct_gops = &fgraph_stub; } +/* The cpu_boot init_task->ret_stack will never be freed */ +static int fgraph_cpu_init(unsigned int cpu) +{ + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + return 0; +} + int register_ftrace_graph(struct fgraph_ops *gops) { + static bool fgraph_initialized; int command = 0; int ret = 0; int i = -1; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); + + if (!fgraph_initialized) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online", + fgraph_cpu_init, NULL); + if (ret < 0) { + pr_warn("fgraph: Error to init cpu hotplug support\n"); + return ret; + } + fgraph_initialized = true; + ret = 0; + } if (!fgraph_array[0]) { /* The array must always have real data on it */ @@ -1258,10 +1273,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) } i = fgraph_lru_alloc_index(); - if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) { - ret = -ENOSPC; - goto out; - } + if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) + return -ENOSPC; gops->idx = i; ftrace_graph_active++; @@ -1298,8 +1311,6 @@ error: gops->saved_func = NULL; fgraph_lru_release_index(i); } -out: - mutex_unlock(&ftrace_lock); return ret; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 77dc0b25140e..5807116bcd0b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6725,39 +6725,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers 
*/ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -6766,11 +6765,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + 
free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index df0745a42a3f..dc819aec43e8 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -306,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u static const struct file_operations interface_enable_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitor_enable_write_data, .read = monitor_enable_read_data, }; @@ -329,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, static const struct file_operations interface_desc_fops = { .open = simple_open, - .llseek = no_llseek, .read = monitor_desc_read_data, }; @@ -674,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us static const struct file_operations monitoring_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitoring_on_write_data, .read = monitoring_on_read_data, }; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 6aae106695b6..7b49cbe388d4 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user static const struct file_operations reacting_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = reacting_on_write_data, .read = reacting_on_read_data, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4f348b4653f..6a891e00aa7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf) ring_buffer_record_enable(buffer); } +static void tracing_reset_all_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits 
have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); +} + /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { @@ -3697,8 +3716,8 @@ static void test_can_verify(void) void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) { - long text_delta = iter->tr->text_delta; - long data_delta = iter->tr->data_delta; + long text_delta = 0; + long data_delta = 0; const char *p = fmt; const char *str; bool good; @@ -3710,6 +3729,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, if (static_branch_unlikely(&trace_no_verify)) goto print; + /* + * When the kernel is booted with the tp_printk command line + * parameter, trace events go directly through to printk(). + * It also is checked by this function, but it does not + * have an associated trace_array (tr) for it. + */ + if (iter->tr) { + text_delta = iter->tr->text_delta; + data_delta = iter->tr->data_delta; + } + /* Don't bother checking when doing a ftrace_dump() */ if (iter->fmt == static_fmt_buf) goto print; @@ -5490,6 +5520,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. 
mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" @@ -6130,8 +6164,13 @@ static void update_last_data(struct trace_array *tr) if (!tr->text_delta && !tr->data_delta) return; - /* Clear old data */ - tracing_reset_online_cpus(&tr->array_buffer); + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; @@ -7557,7 +7596,6 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, - .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { @@ -7636,7 +7674,6 @@ static const struct file_operations snapshot_raw_fops = { .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, }; #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -8466,7 +8503,6 @@ static const struct file_operations tracing_buffers_fops = { .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, - .llseek = no_llseek, .mmap = tracing_buffers_mmap, }; @@ -10613,10 +10649,10 @@ __init static void enable_instances(void) * cannot be deleted by user space, so keep the reference * to it. 
*/ - if (start) + if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT; - else - trace_array_put(tr); + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b0e0ec85912e..ebda68ee9abf 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -912,6 +912,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } } + if (argc - 2 > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto error; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ret = trace_eprobe_tp_update_arg(ep, argv, i); if (ret) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 62e6a8f4aae9..c62d1629cffe 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -21,6 +21,7 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 +#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -385,6 +386,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, struct tracepoint *tpoint, + struct module *mod, int maxactive, int nargs, bool is_return) { @@ -405,6 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->fp.entry_handler = fentry_dispatcher; tf->tpoint = tpoint; + tf->mod = mod; tf->fp.nr_maxactive = maxactive; ret = trace_probe_init(&tf->tp, event, group, false, nargs); @@ -672,6 +675,24 @@ static int unregister_fprobe_event(struct 
trace_fprobe *tf) return trace_probe_unregister_event_call(&tf->tp); } +static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) +{ + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + int ret; + + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); +} + /* Internal register function - just handle fprobe and flags */ static int __register_trace_fprobe(struct trace_fprobe *tf) { @@ -698,18 +719,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) tf->fp.flags |= FPROBE_FL_DISABLED; if (trace_fprobe_is_tracepoint(tf)) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; - /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. 
- */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + + /* This tracepoint is not loaded yet */ + if (tf->tpoint == TRACEPOINT_STUB) + return 0; + + return __regsiter_tracepoint_fprobe(tf); } /* TODO: handle filter, nofilter or symbol list */ @@ -862,20 +877,106 @@ end: return ret; } +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; + struct module *mod; +}; + +static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { + data->tpoint = tp; + if (!data->mod) { + data->mod = mod; + if (!try_module_get(data->mod)) { + data->tpoint = NULL; + data->mod = NULL; + } + } + } +} + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +/* + * Find a tracepoint from kernel and module. If the tracepoint is in a module, + * this increments the module refcount to prevent unloading until the + * trace_fprobe is registered to the list. After registering the trace_fprobe + * on the trace_fprobe list, the module refcount is decremented because + * tracepoint_probe_module_cb will handle it. 
+ */ +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = NULL, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) { + for_each_module_tracepoint(__find_tracepoint_module_cb, &data); + *tp_mod = data.mod; + } + + return data.tpoint; +} + #ifdef CONFIG_MODULES +static void reenable_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_probe *tp = &tf->tp; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + __enable_trace_fprobe(tf); + } +} + +static struct tracepoint *find_tracepoint_in_module(struct module *mod, + const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = mod, + }; + + for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data); + return data.tpoint; +} + static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint *tpoint; struct trace_fprobe *tf; struct dyn_event *pos; - if (val != MODULE_STATE_GOING) + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (tp_mod->mod == tf->mod) { + if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { + tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); + if (tpoint) { + tf->tpoint = tpoint; + tf->mod = tp_mod->mod; + if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && + trace_probe_is_enabled(&tf->tp)) + reenable_trace_fprobe(tf); + } + } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); tf->tpoint = NULL; @@ -892,30 +993,6 @@ static struct notifier_block tracepoint_module_nb = { }; #endif /* CONFIG_MODULES */ -struct __find_tracepoint_cb_data { - const char *tp_name; - 
struct tracepoint *tpoint; -}; - -static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) -{ - struct __find_tracepoint_cb_data *data = priv; - - if (!data->tpoint && !strcmp(data->tp_name, tp->name)) - data->tpoint = tp; -} - -static struct tracepoint *find_tracepoint(const char *tp_name) -{ - struct __find_tracepoint_cb_data data = { - .tp_name = tp_name, - }; - - for_each_kernel_tracepoint(__find_tracepoint_cb, &data); - - return data.tpoint; -} - static int parse_symbol_and_return(int argc, const char *argv[], char **symbol, bool *is_return, bool is_tracepoint) @@ -996,6 +1073,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char abuf[MAX_BTF_ARGS_LEN]; char *dbuf = NULL; bool is_tracepoint = false; + struct module *tp_mod = NULL; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, @@ -1080,15 +1158,20 @@ static int __trace_fprobe_create(int argc, const char *argv[]) if (is_tracepoint) { ctx.flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol); - if (!tpoint) { + tpoint = find_tracepoint(symbol, &tp_mod); + if (tpoint) { + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else if (IS_ENABLED(CONFIG_MODULES)) { + /* This *may* be loaded afterwards */ + tpoint = TRACEPOINT_STUB; + ctx.funcname = symbol; + } else { trace_probe_log_set_index(1); trace_probe_log_err(0, NO_TRACEPOINT); goto parse_error; } - ctx.funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); } else ctx.funcname = symbol; @@ -1104,14 +1187,18 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) goto out; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, - argc, is_return); + tf = 
alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, + maxactive, argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1119,12 +1206,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) goto out; /* We know tf is not allocated */ } - if (is_tracepoint) - tf->mod = __module_text_address( - (unsigned long)tf->tpoint->probestub); - /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); @@ -1155,6 +1238,8 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } out: + if (tp_mod) + module_put(tp_mod); traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..3bd6071441ad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 61a6da808203..263fac44d3ca 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1013,6 +1013,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto out; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1029,7 +1033,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ctx.offset = 0; ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); 
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 1439064f65d6..a50ed23bee77 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1953,12 +1953,8 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - mutex_lock(&interface_lock); - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; - mutex_unlock(&interface_lock); - if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); @@ -1972,7 +1968,6 @@ static void stop_kthread(unsigned int cpu) put_task_struct(kthread); } } else { - mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1994,8 +1989,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - for_each_possible_cpu(cpu) + cpus_read_lock(); + + for_each_online_cpu(cpu) stop_kthread(cpu); + + cpus_read_unlock(); } /* @@ -2007,6 +2006,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2061,11 +2064,10 @@ static int start_per_cpu_kthreads(void) if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (!WARN_ON(!kthread)) kthread_stop(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } for_each_cpu(cpu, current_mask) { @@ -2095,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) mutex_lock(&interface_lock); cpus_read_lock(); + if (!cpu_online(cpu)) + 
goto out_unlock; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 39877c80d6cb..16a5e368e7b7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -276,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; - } else if (len > MAX_EVENT_NAME_LEN) { + } else if (len >= MAX_EVENT_NAME_LEN) { trace_probe_log_err(offset, EVENT_TOO_LONG); return -EINVAL; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..1469dd8075fa 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. 
* On virtual machines we can't rely on timings, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7443e996b1b..fed382b7881b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/rculist.h> #include <linux/filter.h> +#include <linux/percpu.h> #include "trace_dynevent.h" #include "trace_probe.h" @@ -62,7 +63,7 @@ struct trace_uprobe { struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; - unsigned long nhit; + unsigned long __percpu *nhits; struct trace_probe tp; }; @@ -88,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data); static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs); + unsigned long func, struct pt_regs *regs, + __u64 *data); #ifdef CONFIG_STACK_GROWSUP static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) @@ -337,6 +340,12 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); + tu->nhits = alloc_percpu(unsigned long); + if (!tu->nhits) { + ret = -ENOMEM; + goto error; + } + ret = trace_probe_init(&tu->tp, event, group, true, nargs); if (ret < 0) goto error; @@ -349,6 +358,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) return tu; error: + free_percpu(tu->nhits); kfree(tu); return ERR_PTR(ret); @@ -362,6 +372,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) path_put(&tu->path); trace_probe_cleanup(&tu->tp); kfree(tu->filename); + free_percpu(tu->nhits); kfree(tu); } @@ -556,6 +567,8 @@ static int __trace_uprobe_create(int argc, 
const char **argv) if (argc < 2) return -ECANCELED; + if (argc - 2 > MAX_TRACE_ARGS) + return -E2BIG; if (argv[0][1] == ':') event = &argv[0][2]; @@ -681,7 +694,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tu->filename = filename; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { struct traceprobe_parse_context ctx = { .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, }; @@ -815,13 +828,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_uprobe *tu; + unsigned long nhits; + int cpu; if (!is_trace_uprobe(ev)) return 0; tu = to_trace_uprobe(ev); + + nhits = 0; + for_each_possible_cpu(cpu) { + nhits += per_cpu(*tu->nhits, cpu); + } + seq_printf(m, " %s %-44s %15lu\n", tu->filename, - trace_probe_name(&tu->tp), tu->nhit); + trace_probe_name(&tu->tp), nhits); return 0; } @@ -858,6 +879,7 @@ struct uprobe_cpu_buffer { }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; +#define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { @@ -962,6 +984,11 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; + if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { + ucb->dsize = MAX_UCB_BUFFER_SIZE; + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + } + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); *ucbp = ucb; @@ -981,9 +1008,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) - return; - if (trace_trigger_soft_disabled(trace_file)) return; @@ -1500,7 +1524,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, } } -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +static int uprobe_dispatcher(struct uprobe_consumer *con, 
struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; @@ -1508,7 +1533,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); - tu->nhit++; + + this_cpu_inc(*tu->nhits); udd.tu = tu; udd.bp_addr = instruction_pointer(regs); @@ -1530,7 +1556,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) } static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8d1507dd0724..8879da16ef4d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -735,6 +735,48 @@ static __init int init_tracepoints(void) return ret; } __initcall(init_tracepoints); + +/** + * for_each_tracepoint_in_module - iteration on all tracepoints in a module + * @mod: module + * @fct: callback + * @priv: private data + */ +void for_each_tracepoint_in_module(struct module *mod, + void (*fct)(struct tracepoint *tp, + struct module *mod, void *priv), + void *priv) +{ + tracepoint_ptr_t *begin, *end, *iter; + + lockdep_assert_held(&tracepoint_module_list_mutex); + + if (!mod) + return; + + begin = mod->tracepoints_ptrs; + end = mod->tracepoints_ptrs + mod->num_tracepoints; + + for (iter = begin; iter < end; iter++) + fct(tracepoint_ptr_deref(iter), mod, priv); +} + +/** + * for_each_module_tracepoint - iteration on all tracepoints in all modules + * @fct: callback + * @priv: private data + */ +void for_each_module_tracepoint(void (*fct)(struct tracepoint *tp, + struct module *mod, void *priv), + void *priv) +{ + struct tp_module *tp_mod; + + mutex_lock(&tracepoint_module_list_mutex); + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + for_each_tracepoint_in_module(tp_mod->mod, fct, priv); + 
mutex_unlock(&tracepoint_module_list_mutex); +} #endif /* CONFIG_MODULES */ /** diff --git a/kernel/ucount.c b/kernel/ucount.c index 8c07714ff27d..696406939be5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) - goto unwind; + goto dec_unwind; if (iter == ucounts) ret = new; - max = get_userns_rlimit_max(iter->ns, type); + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. 
@@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); -unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } diff --git a/kernel/umh.c b/kernel/umh.c index ff1f13a27d29..be9234270777 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,7 +13,6 @@ #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index d36242fd4936..1895fbc32bcb 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -663,16 +663,14 @@ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); - struct fd f; + CLASS(fd, f)(fd); - f = fdget(fd); - if (fd_file(f)) { + if (!fd_empty(f)) { pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } - fdput(f); } return wqueue; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9949ffad8df0..8b07576814a5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3833,16 +3833,28 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, { bool wait = false; struct pool_workqueue *pwq; + struct worker_pool *current_pool = NULL; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } + /* + * For unbound workqueue, pwqs will map to only a few pools. + * Most of the time, pwqs within the same pool will be linked + * sequentially to wq->pwqs by cpu index. So in the majority + * of pwq iters, the pool is the same, only doing lock/unlock + * if the pool has changed. This can largely reduce expensive + * lock operations. 
+ */ for_each_pwq(pwq, wq) { - struct worker_pool *pool = pwq->pool; - - raw_spin_lock_irq(&pool->lock); + if (current_pool != pwq->pool) { + if (likely(current_pool)) + raw_spin_unlock_irq(&current_pool->lock); + current_pool = pwq->pool; + raw_spin_lock_irq(&current_pool->lock); + } if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -3859,9 +3871,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - raw_spin_unlock_irq(&pool->lock); } + if (current_pool) + raw_spin_unlock_irq(&current_pool->lock); + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); |