diff options
Diffstat (limited to 'kernel')
45 files changed, 1498 insertions, 959 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 10ef068f598d..6fc72b3afbde 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o +obj-$(CONFIG_VHOST_TASK) += vhost_task.o ifdef CONFIG_FUNCTION_TRACER # Do not trace internal ftrace files diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 05f4c66c9089..85720311cc67 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -84,16 +84,13 @@ void bpf_inode_storage_free(struct inode *inode) static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; - struct file *f; - int fd; + struct fd f = fdget_raw(*(int *)key); - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return ERR_PTR(-EBADF); - sdata = inode_storage_lookup(f->f_inode, map, true); - fput(f); + sdata = inode_storage_lookup(file_inode(f.file), map, true); + fdput(f); return sdata ? sdata->data : NULL; } @@ -101,22 +98,19 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; - struct file *f; - int fd; + struct fd f = fdget_raw(*(int *)key); - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return -EBADF; - if (!inode_storage_ptr(f->f_inode)) { - fput(f); + if (!inode_storage_ptr(file_inode(f.file))) { + fdput(f); return -EBADF; } - sdata = bpf_local_storage_update(f->f_inode, + sdata = bpf_local_storage_update(file_inode(f.file), (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); - fput(f); + fdput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -135,16 +129,14 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { - struct file *f; - int fd, err; + struct fd f = fdget_raw(*(int *)key); + int err; - fd = *(int *)key; - f = fget_raw(fd); - if (!f) + if (!f.file) return -EBADF; - err = inode_storage_delete(f->f_inode, map); - fput(f); + err = inode_storage_delete(file_inode(f.file), map); + fdput(f); return err; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d0ed7d6f5eec..a14d0af534b3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -45,8 +45,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd lockdep_assert_held_once(&tr->mutex); /* Instead of updating the trampoline here, we propagate - * -EAGAIN to register_ftrace_direct_multi(). Then we can - * retry register_ftrace_direct_multi() after updating the + * -EAGAIN to register_ftrace_direct(). Then we can + * retry register_ftrace_direct() after updating the * trampoline. */ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && @@ -198,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); + ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); @@ -215,9 +215,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad if (tr->func.ftrace_managed) { if (lock_direct_mutex) - ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); + ret = modify_ftrace_direct(tr->fops, (long)new_addr); else - ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); + ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); } @@ -243,7 +243,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); - ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); + ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..767e8930b0bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2967,6 +2967,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!(*reg_mask & (dreg | sreg))) + return 0; + /* dreg <cond> sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + *reg_mask |= (sreg | dreg); + /* else dreg <cond> K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. + */ } } else if (class == BPF_LD) { if (!(*reg_mask & dreg)) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 935e8121b21e..4b249f81c693 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6856,14 +6856,12 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; - struct file *f; - - f = fget_raw(fd); - if (!f) + struct fd f = fdget_raw(fd); + if (!f.file) return ERR_PTR(-EBADF); - cgrp = cgroup_v1v2_get_from_file(f); - fput(f); + cgrp = cgroup_v1v2_get_from_file(f.file); + fdput(f); return cgrp; } diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2f9c912df1c..144b2bd86b14 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -7,6 +7,5 @@ CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set # CONFIG_SLAB is not set -# CONFIG_SLOB_DEPRECATED is not set CONFIG_SLUB=y CONFIG_SLUB_TINY=y diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 0b6379adff6b..5340c5aa89e7 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> #include <linux/prctl.h> +#include <linux/ptrace.h> #include <linux/syscall_user_dispatch.h> #include <linux/uaccess.h> #include <linux/signal.h> @@ -68,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs) return true; } -int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, - unsigned long len, char __user *selector) +static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode, + unsigned long offset, unsigned long len, + char __user *selector) { switch (mode) { case PR_SYS_DISPATCH_OFF: @@ -86,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, if (offset && offset + len <= offset) return -EINVAL; - if (selector && !access_ok(selector, sizeof(*selector))) + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) return -EFAULT; break; @@ -94,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, return -EINVAL; } - current->syscall_dispatch.selector = selector; - current->syscall_dispatch.offset = offset; - current->syscall_dispatch.len = len; - current->syscall_dispatch.on_dispatch = false; + task->syscall_dispatch.selector = selector; + task->syscall_dispatch.offset = offset; + task->syscall_dispatch.len = len; + task->syscall_dispatch.on_dispatch = false; if (mode == PR_SYS_DISPATCH_ON) - set_syscall_work(SYSCALL_USER_DISPATCH); + set_task_syscall_work(task, SYSCALL_USER_DISPATCH); + else + clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); + + return 0; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return task_set_syscall_user_dispatch(current, mode, offset, len, selector); +} + +int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct syscall_user_dispatch *sd = &task->syscall_dispatch; + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; + + if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH)) + cfg.mode = PR_SYS_DISPATCH_ON; else - clear_syscall_work(SYSCALL_USER_DISPATCH); + cfg.mode = PR_SYS_DISPATCH_OFF; + + cfg.offset = sd->offset; + cfg.len = sd->len; + cfg.selector = (__u64)(uintptr_t)sd->selector; + + if (copy_to_user(data, &cfg, sizeof(cfg))) + return -EFAULT; return 0; } + +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, + void __user *data) +{ + struct ptrace_sud_config cfg; + + if (size != sizeof(cfg)) + return -EINVAL; + + if (copy_from_user(&cfg, data, sizeof(cfg))) + return -EFAULT; + + return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len, + (char __user *)(uintptr_t)cfg.selector); +} diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..bfe73db1c26c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: @@ -1625,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk, + int no_files) { struct files_struct *oldf, *newf; int error = 0; @@ -1637,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) if (!oldf) goto out; + if (no_files) { + tsk->files = NULL; + goto out; + } + if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; @@ -1954,6 +1961,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2008,7 +2100,7 @@ static void rv_task_fork(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static __latent_entropy struct task_struct *copy_process( +__latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, @@ -2101,6 +2193,8 @@ static __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; + if (args->user_worker) + p->flags |= PF_USER_WORKER; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2110,6 +2204,9 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->name) + strscpy_pad(p->comm, args->name, sizeof(p->comm)); + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2252,7 +2349,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; - retval = copy_files(clone_flags, p); + retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); @@ -2277,6 +2374,9 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + if (args->ignore_signals) + ignore_signals(p); + stackleak_task_init(p); if (pid != &init_struct_pid) { @@ -2294,21 +2394,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; @@ -2625,6 +2716,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .fn = fn, .fn_arg = arg, .io_thread = 1, + .user_worker = 1, }; return copy_process(NULL, 0, node, &args); @@ -2728,7 +2820,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) /* * Create a kernel thread. */ -pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | @@ -2736,6 +2829,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, + .name = name, .kthread = 1, }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8ce75495e04f..d2742af0f0fd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -189,9 +189,12 @@ void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; - for_each_action_of_desc(desc, action) + for_each_action_of_desc(desc, action) { if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); + if (action->secondary && action->secondary->thread) + set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); + } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 54d077e1a2dc..5a60cc52adc0 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -337,11 +337,20 @@ static void delay_access(int type) */ static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size) { + /* + * In the below we don't necessarily need the read of the location to + * be atomic, and we don't use READ_ONCE(), since all we need for race + * detection is to observe 2 different values. + * + * Furthermore, on certain architectures (such as arm64), READ_ONCE() + * may turn into more complex instructions than a plain load that cannot + * do unaligned accesses. + */ switch (size) { - case 1: return READ_ONCE(*(const u8 *)ptr); - case 2: return READ_ONCE(*(const u16 *)ptr); - case 4: return READ_ONCE(*(const u32 *)ptr); - case 8: return READ_ONCE(*(const u64 *)ptr); + case 1: return *(const volatile u8 *)ptr; + case 2: return *(const volatile u16 *)ptr; + case 4: return *(const volatile u32 *)ptr; + case 8: return *(const volatile u64 *)ptr; default: return 0; /* Ignore; we do not diff the values. */ } } diff --git a/kernel/kthread.c b/kernel/kthread.c index 7e6751b29101..4bc6e0971ec9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -38,6 +38,7 @@ struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ + char *full_name; int (*threadfn)(void *data); void *data; int node; @@ -343,10 +344,12 @@ static int kthread(void *_create) /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { + kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } + self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; @@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create) current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ - pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); + pid = kernel_thread(kthread, create, create->full_name, + CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); + kfree(create->full_name); if (!done) { kfree(create); return; @@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), create->data = data; create->node = node; create->done = &done; + create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + if (!create->full_name) { + task = ERR_PTR(-ENOMEM); + goto free_create; + } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); @@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), wait_for_completion(&done); } task = create->result; - if (!IS_ERR(task)) { - char name[TASK_COMM_LEN]; - va_list aq; - int len; - - /* - * task is already visible to other tasks, so updating - * COMM must be protected. - */ - va_copy(aq, args); - len = vsnprintf(name, sizeof(name), namefmt, aq); - va_end(aq); - if (len >= TASK_COMM_LEN) { - struct kthread *kthread = to_kthread(task); - - /* leave it truncated when out of memory. */ - kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); - } - set_task_comm(task, name); - } +free_create: kfree(create); return task; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 4bd2d5e10f20..feaf90c5e537 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -596,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) complete(&patch->finish); } -static struct kobj_type klp_ktype_patch = { +static const struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, .default_groups = klp_patch_groups, @@ -612,7 +612,7 @@ static void klp_kobj_release_object(struct kobject *kobj) klp_free_object_dynamic(obj); } -static struct kobj_type klp_ktype_object = { +static const struct kobj_type klp_ktype_object = { .release = klp_kobj_release_object, .sysfs_ops = &kobj_sysfs_ops, .default_groups = klp_object_groups, @@ -628,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj) klp_free_func_nop(func); } -static struct kobj_type klp_ktype_func = { +static const struct kobj_type klp_ktype_func = { .release = klp_kobj_release_func, .sysfs_ops = &kobj_sysfs_ops, }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 50d4863974e7..dcd1d5bfc1e0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src, struct lock_class *source = hlock_class(src); struct lock_class *target = hlock_class(tgt); struct lock_class *parent = prt->class; + int src_read = src->read; + int tgt_read = tgt->read; /* * A direct locking problem where unsafe_class lock is taken @@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src, printk(" Possible unsafe locking scenario:\n\n"); printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); - printk(" lock("); + if (tgt_read != 0) + printk(" rlock("); + else + printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); printk(" lock("); @@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src, printk(" lock("); __print_lock_name(target); printk(KERN_CONT ");\n"); - printk(" lock("); + if (src_read != 0) + printk(" rlock("); + else if (src->sync) + printk(" sync("); + else + printk(" lock("); __print_lock_name(source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); @@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) return 0; } } - if (!hlock->hardirqs_off) { + + /* + * For lock_sync(), don't mark the ENABLED usage, since lock_sync() + * creates no critical section and no extra dependency can be introduced + * by interrupts + */ + if (!hlock->hardirqs_off && !hlock->sync) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) @@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references, int pin_count) + int references, int pin_count, int sync) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { /* we're holding locks */ + if (depth && !sync) { + /* we're holding locks and the new held lock is not a sync */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; + hlock->sync = !!sync; hlock->hardirqs_off = !!hardirqs_off; hlock->references = references; #ifdef CONFIG_LOCK_STAT @@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; + /* For lock_sync(), we are done here since no actual critical section */ + if (hlock->sync) + return 1; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) { + hlock->references, hlock->pin_count, 0)) { case 0: return 1; case 1: @@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0); lockdep_recursion_finish(); raw_local_irq_restore(flags); } @@ -5693,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); +/* + * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API. + * + * No actual critical section is created by the APIs annotated with this: these + * APIs are used to wait for one or multiple critical sections (on other CPUs + * or threads), and it means that calling these APIs inside these critical + * sections is potential deadlock. + */ +void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, + int check, struct lockdep_map *nest_lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lockdep_enabled())) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + lockdep_recursion_inc(); + __lock_acquire(lock, subclass, 0, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1); + check_chain_key(current); + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_sync); + noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f04b1978899d..153ddc4c47ef 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -51,8 +51,11 @@ torture_param(int, rt_boost, 2, torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ +#define MAX_NESTED_LOCKS 8 -static char *torture_type = "spin_lock"; +static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); @@ -79,10 +82,12 @@ static void lock_torture_cleanup(void); struct lock_torture_ops { void (*init)(void); void (*exit)(void); + int (*nested_lock)(int tid, u32 lockset); int (*writelock)(int tid); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(int tid); + void (*nested_unlock)(int tid, u32 lockset); int (*readlock)(int tid); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(int tid); @@ -252,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_RAW_SPINLOCK(torture_raw_spinlock); + +static int torture_raw_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + raw_spin_lock(&torture_raw_spinlock); + return 0; +} + +static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock(&torture_raw_spinlock); +} + +static struct lock_torture_ops raw_spin_lock_ops = { + .writelock = torture_raw_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock" +}; + +static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&torture_raw_spinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_spin_lock_irq_ops = { + .writelock = torture_raw_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock_irq" +}; + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -365,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = { }; static DEFINE_MUTEX(torture_mutex); +static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS]; + +static void torture_mutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __mutex_init(&torture_nested_mutexes[i], __func__, + &nested_mutex_keys[i]); +} + +static int torture_mutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + mutex_lock(&torture_nested_mutexes[i]); + return 0; +} static int torture_mutex_lock(int tid __maybe_unused) __acquires(torture_mutex) @@ -393,11 +473,24 @@ __releases(torture_mutex) mutex_unlock(&torture_mutex); } +static void torture_mutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + mutex_unlock(&torture_nested_mutexes[i]); +} + static struct lock_torture_ops mutex_lock_ops = { + .init = torture_mutex_init, + .nested_lock = torture_mutex_nested_lock, .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, .task_boost = torture_rt_boost, .writeunlock = torture_mutex_unlock, + .nested_unlock = torture_mutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -504,6 +597,28 @@ static struct lock_torture_ops ww_mutex_lock_ops = { #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); +static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS]; + +static void torture_rtmutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __rt_mutex_init(&torture_nested_rtmutexes[i], __func__, + &nested_rtmutex_keys[i]); +} + +static int torture_rtmutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + rt_mutex_lock(&torture_nested_rtmutexes[i]); + return 0; +} static int torture_rtmutex_lock(int tid __maybe_unused) __acquires(torture_rtmutex) @@ -545,11 +660,24 @@ static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) __torture_rt_boost(trsp); } +static void torture_rtmutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + rt_mutex_unlock(&torture_nested_rtmutexes[i]); +} + static struct lock_torture_ops rtmutex_lock_ops = { + .init = torture_rtmutex_init, + .nested_lock = torture_rtmutex_nested_lock, .writelock = torture_rtmutex_lock, .write_delay = torture_rtmutex_delay, .task_boost = torture_rt_boost_rtmutex, .writeunlock = torture_rtmutex_unlock, + .nested_unlock = torture_rtmutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -684,6 +812,8 @@ static int lock_torture_writer(void *arg) struct lock_stress_stats *lwsp = arg; int tid = lwsp - cxt.lwsa; DEFINE_TORTURE_RANDOM(rand); + u32 lockset_mask; + bool skip_main_lock; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -692,19 +822,40 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->task_boost(&rand); - cxt.cur_ops->writelock(tid); - if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_lock_fail++; - lock_is_write_held = true; - if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) - lwsp->n_lock_fail++; /* rare, but... */ + lockset_mask = torture_random(&rand); + /* + * When using nested_locks, we want to occasionally + * skip the main lock so we can avoid always serializing + * the lock chains on that central lock. By skipping the + * main lock occasionally, we can create different + * contention patterns (allowing for multiple disjoint + * blocked trees) + */ + skip_main_lock = (nested_locks && + !(torture_random(&rand) % 100)); - lwsp->n_lock_acquired++; + cxt.cur_ops->task_boost(&rand); + if (cxt.cur_ops->nested_lock) + cxt.cur_ops->nested_lock(tid, lockset_mask); + + if (!skip_main_lock) { + cxt.cur_ops->writelock(tid); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_lock_fail++; + lock_is_write_held = true; + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) + lwsp->n_lock_fail++; /* rare, but... */ + + lwsp->n_lock_acquired++; + } cxt.cur_ops->write_delay(&rand); - lock_is_write_held = false; - WRITE_ONCE(last_lock_release, jiffies); - cxt.cur_ops->writeunlock(tid); + if (!skip_main_lock) { + lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); + cxt.cur_ops->writeunlock(tid); + } + if (cxt.cur_ops->nested_unlock) + cxt.cur_ops->nested_unlock(tid, lockset_mask); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); @@ -845,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, - verbose, shuffle_interval, stutter, shutdown_secs, - onoff_interval, onoff_holdoff); + cxt.nrealwriters_stress, cxt.nrealreaders_stress, + nested_locks, stat_interval, verbose, shuffle_interval, + stutter, shutdown_secs, onoff_interval, onoff_holdoff); } static void lock_torture_cleanup(void) @@ -919,6 +1070,7 @@ static int __init lock_torture_init(void) static struct lock_torture_ops *torture_ops[] = { &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &raw_spin_lock_ops, &raw_spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, @@ -1068,6 +1220,10 @@ static int __init lock_torture_init(void) } } + /* cap nested_locks to MAX_NESTED_LOCKS */ + if (nested_locks > MAX_NESTED_LOCKS) + nested_locks = MAX_NESTED_LOCKS; + if (cxt.cur_ops->readlock) { reader_tasks = kcalloc(cxt.nrealreaders_stress, sizeof(reader_tasks[0]), diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 29dc253d03af..93cca6e69860 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c index 486d4ff92719..a89f01e1d6b7 100644 --- a/kernel/module/livepatch.c +++ b/kernel/module/livepatch.c @@ -11,7 +11,7 @@ #include "internal.h" /* - * Persist Elf information about a module. Copy the Elf header, + * Persist ELF information about a module. Copy the ELF header, * section header table, section string table, and symtab section * index from info to mod->klp_info. */ @@ -25,11 +25,11 @@ int copy_module_elf(struct module *mod, struct load_info *info) if (!mod->klp_info) return -ENOMEM; - /* Elf header */ + /* ELF header */ size = sizeof(mod->klp_info->hdr); memcpy(&mod->klp_info->hdr, info->hdr, size); - /* Elf section header table */ + /* ELF section header table */ size = sizeof(*info->sechdrs) * info->hdr->e_shnum; mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); if (!mod->klp_info->sechdrs) { @@ -37,7 +37,7 @@ int copy_module_elf(struct module *mod, struct load_info *info) goto free_info; } - /* Elf section name string table */ + /* ELF section name string table */ size = info->sechdrs[info->hdr->e_shstrndx].sh_size; mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); if (!mod->klp_info->secstrings) { @@ -45,7 +45,7 @@ int copy_module_elf(struct module *mod, struct load_info *info) goto free_sechdrs; } - /* Elf symbol section index */ + /* ELF symbol section index */ symndx = info->index.sym; mod->klp_info->symndx = symndx; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a487ff24129b..80d9c6d77a45 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,21 +545,20 @@ static void commit_nsset(struct nsset *nsset) SYSCALL_DEFINE2(setns, int, fd, int, flags) { - struct file *file; + struct fd f = fdget(fd); struct ns_common *ns = NULL; struct nsset nsset = {}; int err = 0; - file = fget(fd); - if (!file) + if (!f.file) return -EBADF; - if (proc_ns_file(file)) { - ns = get_proc_ns(file_inode(file)); + if (proc_ns_file(f.file)) { + ns = get_proc_ns(file_inode(f.file)); if (flags && (ns->ops->type != flags)) err = -EINVAL; flags = ns->ops->type; - } else if (!IS_ERR(pidfd_pid(file))) { + } else if (!IS_ERR(pidfd_pid(f.file))) { err = check_setns_flags(flags); } else { err = -EINVAL; @@ -571,17 +570,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (err) goto out; - if (proc_ns_file(file)) + if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, file->private_data); + err = validate_nsset(&nsset, f.file->private_data); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); } put_nsset(&nsset); out: - fput(file); + fdput(f); return err; } diff --git a/kernel/pid.c b/kernel/pid.c index 3fbc5e46b721..f93954a0384d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) */ int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; - - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) - return -EINVAL; + int pidfd; + struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - flags | O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fd0c9f913940..9644f6e5bf15 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -730,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) size_t len = iov_iter_count(from); ssize_t ret = len; - if (!user || len > PRINTKRB_RECORD_MAX) + if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ @@ -792,9 +792,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, }; ssize_t ret; - if (!user) - return -EBADF; - ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; @@ -859,8 +856,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) struct devkmsg_user *user = file->private_data; loff_t ret = 0; - if (!user) - return -EBADF; if (offset) return -ESPIPE; @@ -893,9 +888,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) struct printk_info info; __poll_t ret = 0; - if (!user) - return EPOLLERR|EPOLLNVAL; - poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { @@ -944,9 +936,6 @@ static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; - if (!user) - return 0; - ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0786450074c1..443057bee87c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,7 @@ #include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/minmax.h> +#include <linux/syscall_user_dispatch.h> #include <asm/syscall.h> /* for syscall_get_* */ @@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request, break; #endif + case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG: + ret = syscall_user_dispatch_set_config(child, addr, datavp); + break; + + case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG: + ret = syscall_user_dispatch_get_config(child, addr, datavp); + break; + default: break; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index ab62074174c3..9071182b1284 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -53,9 +53,6 @@ config RCU_EXPERT Say N if you are unsure. -config SRCU - def_bool y - config TINY_SRCU bool default y if TINY_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 115616ac3bfa..4a1b9622598b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -14,6 +14,43 @@ /* * Grace-period counter management. + * + * The two least significant bits contain the control flags. + * The most significant bits contain the grace-period sequence counter. + * + * When both control flags are zero, no grace period is in progress. + * When either bit is non-zero, a grace period has started and is in + * progress. When the grace period completes, the control flags are reset + * to 0 and the grace-period sequence counter is incremented. + * + * However some specific RCU usages make use of custom values. + * + * SRCU special control values: + * + * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node + * is initialized. + * + * SRCU_STATE_IDLE : No SRCU gp is in progress + * + * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates + * we are scanning the readers on the slot + * defined as inactive (there might well + * be pending readers that will use that + * index, but their number is bounded). + * + * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state() + * Indicates we are flipping the readers + * index and then scanning the readers on the + * slot newly designated as inactive (again, + * the number of pending readers that will use + * this inactive index is bounded). + * + * RCU polled GP special control value: + * + * RCU_GET_STATE_COMPLETED : State value indicating an already-completed + * polled GP has completed. This value covers + * both the state and the counter of the + * grace-period sequence number. */ #define RCU_SEQ_CTR_SHIFT 2 @@ -341,11 +378,13 @@ extern void rcu_init_geometry(void); * specified state structure (for SRCU) or the only rcu_state structure * (for RCU). */ -#define srcu_for_each_node_breadth_first(sp, rnp) \ +#define _rcu_for_each_node_breadth_first(sp, rnp) \ for ((rnp) = &(sp)->node[0]; \ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) #define rcu_for_each_node_breadth_first(rnp) \ - srcu_for_each_node_breadth_first(&rcu_state, rnp) + _rcu_for_each_node_breadth_first(&rcu_state, rnp) +#define srcu_for_each_node_breadth_first(ssp, rnp) \ + _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp) /* * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 91fb5905a008..e82ec9f9a5d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -631,8 +631,7 @@ static int compute_real(int n) static int rcu_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_scale_cleanup(); kernel_power_off(); @@ -716,7 +715,7 @@ kfree_scale_thread(void *arg) // is tested. if ((kfree_rcu_test_single && !kfree_rcu_test_double) || (kfree_rcu_test_both && torture_random(&tr) & 0x800)) - kfree_rcu(alloc_ptr); + kfree_rcu_mightsleep(alloc_ptr); else kfree_rcu(alloc_ptr, rh); } @@ -771,8 +770,8 @@ kfree_scale_cleanup(void) static int kfree_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + wait_event_idle(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8e6c023212cb..147551c23baf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -119,7 +119,9 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; @@ -179,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; @@ -2194,12 +2195,11 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ", atomic_read(&n_rcu_torture_mberror), atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); + n_rcu_torture_boost_ktrerror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, @@ -2217,15 +2217,13 @@ rcu_torture_stats_print(void) if (atomic_read(&n_rcu_torture_mberror) || atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || - n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || - i > 1) { + n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread - WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } @@ -2358,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " - "nocbs_nthreads=%d nocbs_toggle=%d\n", + "nocbs_nthreads=%d nocbs_toggle=%d " + "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2369,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, - nocbs_nthreads, nocbs_toggle); + nocbs_nthreads, nocbs_toggle, + test_nmis); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3273,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void) torture_stop_kthread(rcutorture_read_exit, read_exit_task); } +static void rcutorture_test_nmis(int n) +{ +#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + int cpu; + int dumpcpu; + int i; + + for (i = 0; i < n; i++) { + preempt_disable(); + cpu = smp_processor_id(); + dumpcpu = cpu + 1; + if (dumpcpu >= nr_cpu_ids) + dumpcpu = 0; + pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu); + dump_cpu_task(dumpcpu); + preempt_enable(); + schedule_timeout_uninterruptible(15 * HZ); + } +#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis); +#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) +} + static enum cpuhp_state rcutor_hp; static void @@ -3297,6 +3320,8 @@ rcu_torture_cleanup(void) return; } + rcutorture_test_nmis(test_nmis); + if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); rcu_torture_read_exit_cleanup(); @@ -3463,6 +3488,188 @@ static void rcutorture_sync(void) cur_ops->sync(); } +static DEFINE_MUTEX(mut0); +static DEFINE_MUTEX(mut1); +static DEFINE_MUTEX(mut2); +static DEFINE_MUTEX(mut3); +static DEFINE_MUTEX(mut4); +static DEFINE_MUTEX(mut5); +static DEFINE_MUTEX(mut6); +static DEFINE_MUTEX(mut7); +static DEFINE_MUTEX(mut8); +static DEFINE_MUTEX(mut9); + +static DECLARE_RWSEM(rwsem0); +static DECLARE_RWSEM(rwsem1); +static DECLARE_RWSEM(rwsem2); +static DECLARE_RWSEM(rwsem3); +static DECLARE_RWSEM(rwsem4); +static DECLARE_RWSEM(rwsem5); +static DECLARE_RWSEM(rwsem6); +static DECLARE_RWSEM(rwsem7); +static DECLARE_RWSEM(rwsem8); +static DECLARE_RWSEM(rwsem9); + +DEFINE_STATIC_SRCU(srcu0); +DEFINE_STATIC_SRCU(srcu1); +DEFINE_STATIC_SRCU(srcu2); +DEFINE_STATIC_SRCU(srcu3); +DEFINE_STATIC_SRCU(srcu4); +DEFINE_STATIC_SRCU(srcu5); +DEFINE_STATIC_SRCU(srcu6); +DEFINE_STATIC_SRCU(srcu7); +DEFINE_STATIC_SRCU(srcu8); +DEFINE_STATIC_SRCU(srcu9); + +static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i, + int cyclelen, int deadlock) +{ + int j = i + 1; + + if (j >= cyclelen) + j = deadlock ? 0 : -1; + if (j >= 0) + pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i); + else + pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i); + return j; +} + +// Test lockdep on SRCU-based deadlock scenarios. +static void rcu_torture_init_srcu_lockdep(void) +{ + int cyclelen; + int deadlock; + bool err = false; + int i; + int j; + int idx; + struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4, + &mut5, &mut6, &mut7, &mut8, &mut9 }; + struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4, + &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 }; + struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4, + &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 }; + int testtype; + + if (!test_srcu_lockdep) + return; + + deadlock = test_srcu_lockdep / 1000; + testtype = (test_srcu_lockdep / 10) % 100; + cyclelen = test_srcu_lockdep % 10; + WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus)); + if (WARN_ONCE(deadlock != !!deadlock, + "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n", + __func__, test_srcu_lockdep, deadlock)) + err = true; + if (WARN_ONCE(cyclelen <= 0, + "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n", + __func__, test_srcu_lockdep, cyclelen)) + err = true; + if (err) + goto err_out; + + if (testtype == 0) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu", + "srcu_read_unlock", i, cyclelen, deadlock); + idx = srcu_read_lock(srcus[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + srcu_read_unlock(srcus[i], idx); + } + return; + } + + if (testtype == 1) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + mutex_lock(muts[i]); + mutex_unlock(muts[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu", + "mutex_unlock", i, cyclelen, deadlock); + mutex_lock(muts[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + mutex_unlock(muts[i]); + } + return; + } + + if (testtype == 2) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + down_read(rwsems[i]); + up_read(rwsems[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu", + "up_write", i, cyclelen, deadlock); + down_write(rwsems[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + up_write(rwsems[i]); + } + return; + } + +#ifdef CONFIG_TASKS_TRACE_RCU + if (testtype == 3) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock"; + char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace" + : "synchronize_srcu"; + char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock"; + + j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock); + if (i == 0) + rcu_read_lock_trace(); + else + idx = srcu_read_lock(srcus[i]); + if (j >= 0) { + if (i == cyclelen - 1) + synchronize_rcu_tasks_trace(); + else + synchronize_srcu(srcus[j]); + } + if (i == 0) + rcu_read_unlock_trace(); + else + srcu_read_unlock(srcus[i], idx); + } + return; + } +#endif // #ifdef CONFIG_TASKS_TRACE_RCU + +err_out: + pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); + pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); + pr_info("%s: D: Deadlock if nonzero.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__); + pr_info("%s: L: Cycle length.\n", __func__); + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU)) + pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__); +} + static int __init rcu_torture_init(void) { @@ -3501,9 +3708,17 @@ rcu_torture_init(void) pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } + if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops || + !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n", + cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU)); + nocbs_nthreads = 0; + } if (cur_ops->init) cur_ops->init(); + rcu_torture_init_srcu_lockdep(); + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -3540,7 +3755,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index afa3e1a2f690..1970ce5f22d4 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1031,7 +1031,7 @@ ref_scale_cleanup(void) static int ref_scale_shutdown(void *arg) { - wait_event(shutdown_wq, shutdown_start); + wait_event_idle(shutdown_wq, shutdown_start); smp_mb(); // Wake before output. ref_scale_cleanup(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b12fb0cec44d..336af24e0fe3 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ab4ee58af84b..20d7a238d675 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -103,7 +103,7 @@ do { \ #define spin_trylock_irqsave_rcu_node(p, flags) \ ({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ \ if (___locked) \ smp_mb__after_unlock_lock(); \ @@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -173,14 +173,14 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); - if (!ssp->node) + ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); + if (!ssp->srcu_sup->node) return false; /* Work out the overall tree geometry. */ - ssp->level[0] = &ssp->node[0]; + ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) - ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ @@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; - if (snp == &ssp->node[0]) { + if (snp == &ssp->srcu_sup->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == ssp->level[level + 1]) + if (snp == ssp->srcu_sup->level[level + 1]) level++; - snp->srcu_parent = ssp->level[level - 1] + - (snp - ssp->level[level]) / + snp->srcu_parent = ssp->srcu_sup->level[level - 1] + + (snp - ssp->srcu_sup->level[level]) / levelspread[level - 1]; } @@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) * leaves of the srcu_node tree. */ level = rcu_num_lvls - 1; - snp_first = ssp->level[level]; + snp_first = ssp->srcu_sup->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); sdp->mynode = &snp_first[cpu / levelspread[level]]; @@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) } sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; } @@ -236,36 +236,47 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - ssp->srcu_size_state = SRCU_SIZE_SMALL; - ssp->node = NULL; - mutex_init(&ssp->srcu_cb_mutex); - mutex_init(&ssp->srcu_gp_mutex); + if (!is_static) + ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); + if (!ssp->srcu_sup) + return -ENOMEM; + if (!is_static) + spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->node = NULL; + mutex_init(&ssp->srcu_sup->srcu_cb_mutex); + mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_gp_seq = 0; - ssp->srcu_barrier_seq = 0; - mutex_init(&ssp->srcu_barrier_mutex); - atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&ssp->work, process_srcu); - ssp->sda_is_static = is_static; + ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) + if (!ssp->sda) { + if (!is_static) + kfree(ssp->srcu_sup); return -ENOMEM; + } init_srcu_struct_data(ssp); - ssp->srcu_gp_seq_needed_exp = 0; - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->sda_is_static) { + if (!ssp->srcu_sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(ssp->srcu_sup); return -ENOMEM; } } else { - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } } - smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + ssp->srcu_sup->srcu_ssp = ssp; + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -277,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -294,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -306,8 +315,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); */ static void __srcu_transition_to_big(struct srcu_struct *ssp) { - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); } /* @@ -318,15 +327,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) unsigned long flags; /* Double-checked locking on ->srcu_size-state. */ - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp, flags); - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -337,14 +346,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) return; j = jiffies; - if (ssp->srcu_size_jiffies != j) { - ssp->srcu_size_jiffies = j; - ssp->srcu_n_lock_retries = 0; + if (ssp->srcu_sup->srcu_size_jiffies != j) { + ssp->srcu_sup->srcu_size_jiffies = j; + ssp->srcu_sup->srcu_n_lock_retries = 0; } - if (++ssp->srcu_n_lock_retries <= small_contention_lim) + if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim) return; __srcu_transition_to_big(ssp); } @@ -361,9 +370,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon if (spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_rcu_node(sdp, *flags); } @@ -375,9 +384,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon */ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); } @@ -394,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp, flags); - if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -607,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long gpstart; unsigned long j; unsigned long jbase = SRCU_INTERVAL; + struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; - gpstart = READ_ONCE(ssp->srcu_gp_start); + gpstart = READ_ONCE(sup->srcu_gp_start); if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } @@ -634,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + struct srcu_usage *sup = ssp->srcu_sup; if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - flush_delayed_work(&ssp->work); + flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -648,21 +659,23 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || + if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), - rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); + __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), + rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - if (!ssp->sda_is_static) { + kfree(sup->node); + sup->node = NULL; + sup->srcu_size_state = SRCU_SIZE_SMALL; + if (!sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(sup); + ssp->srcu_sup = NULL; } - kfree(ssp->node); - ssp->node = NULL; - ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -760,23 +773,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = this_cpu_ptr(ssp->sda); - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ - WRITE_ONCE(ssp->srcu_gp_start, jiffies); - WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); + WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(ssp->srcu_gp_seq); + rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); + state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -849,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long sgsne; struct srcu_node *snp; int ss_state; + struct srcu_usage *sup = ssp->srcu_sup; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_cb_mutex); + mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp); - idx = rcu_seq_state(ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) cbdelay = 0; - WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); - rcu_seq_end(&ssp->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&sup->srcu_gp_seq); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); + spin_unlock_irq_rcu_node(sup); + mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), cbdelay); @@ -879,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -911,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&ssp->srcu_cb_mutex); + mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(ssp); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -930,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ss_state == SRCU_SIZE_ALLOC) init_srcu_struct_nodes(ssp, GFP_KERNEL); else - smp_store_release(&ssp->srcu_size_state, ss_state + 1); + smp_store_release(&sup->srcu_size_state, ss_state + 1); } } @@ -950,7 +964,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (snp) for (; snp != NULL; snp = snp->srcu_parent) { sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) || + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -963,9 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -990,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, struct srcu_node *snp; struct srcu_node *snp_leaf; unsigned long snp_seq; + struct srcu_usage *sup = ssp->srcu_sup; /* Ensure that snp node tree is fully initialized before traversing it */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) snp_leaf = NULL; else snp_leaf = sdp->mynode; @@ -1000,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -1027,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); + if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s); /* If grace period not already in progress, start it. */ - if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && + rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1049,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->work, + queue_delayed_work(rcu_gp_wq, &sup->work, !!srcu_get_delay(ssp)); - else if (list_empty(&ssp->work.work.entry)) - list_add(&ssp->work.work.entry, &srcu_boot_list); + else if (list_empty(&sup->work.work.entry)) + list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(sup, flags); } /* @@ -1085,16 +1100,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) static void srcu_flip(struct srcu_struct *ssp) { /* - * Ensure that if this updater saw a given reader's increment - * from __srcu_read_lock(), that reader was using an old value - * of ->srcu_idx. Also ensure that if a given reader sees the - * new value of ->srcu_idx, this updater's earlier scans cannot - * have seen that reader's increments (which is OK, because this - * grace period need not wait on that reader). + * Because the flip of ->srcu_idx is executed only if the + * preceding call to srcu_readers_active_idx_check() found that + * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched + * and because that summing uses atomic_long_read(), there is + * ordering due to a control dependency between that summing and + * the WRITE_ONCE() in this call to srcu_flip(). This ordering + * ensures that if this updater saw a given reader's increment from + * __srcu_read_lock(), that reader was using a value of ->srcu_idx + * from before the previous call to srcu_flip(), which should be + * quite rare. This ordering thus helps forward progress because + * the grace period could otherwise be delayed by additional + * calls to __srcu_read_lock() using that old (soon to be new) + * value of ->srcu_idx. + * + * This sum-equality check and ordering also ensures that if + * a given call to __srcu_read_lock() uses the new value of + * ->srcu_idx, this updater's earlier scans cannot have seen + * that reader's increments, which is all to the good, because + * this grace period need not wait on that reader. After all, + * if those earlier scans had seen that reader, there would have + * been a sum mismatch and this code would not be reached. + * + * This means that the following smp_mb() is redundant, but + * it stays until either (1) Compilers learn about this sort of + * control dependency or (2) Some production workload running on + * a production system is unduly delayed by this slowpath smp_mb(). */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1154,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); - tlast = READ_ONCE(ssp->srcu_last_gp_end); + tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end); if (exp_holdoff == 0 || time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&ssp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -1199,7 +1234,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, * sequence number cannot wrap around in the meantime. */ idx = __srcu_read_lock_nmisafe(ssp); - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else @@ -1208,8 +1243,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -1307,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || @@ -1420,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) // Any prior manipulation of SRCU-protected data must happen // before the load from ->srcu_gp_seq. smp_mb(); - return rcu_seq_snap(&ssp->srcu_gp_seq); + return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -1467,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1486,8 +1523,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp) sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); } /* @@ -1501,13 +1538,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp) static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } @@ -1520,23 +1557,23 @@ void srcu_barrier(struct srcu_struct *ssp) { int cpu; int idx; - unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq); check_init_srcu_struct(ssp); - mutex_lock(&ssp->srcu_barrier_mutex); - if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { + mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&ssp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&ssp->srcu_barrier_seq); - init_completion(&ssp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq); + init_completion(&ssp->srcu_sup->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1); idx = __srcu_read_lock_nmisafe(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); else for_each_possible_cpu(cpu) @@ -1544,12 +1581,12 @@ void srcu_barrier(struct srcu_struct *ssp) __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); - wait_for_completion(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion); - rcu_seq_end(&ssp->srcu_barrier_seq); - mutex_unlock(&ssp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); @@ -1575,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&ssp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1587,39 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 1)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp); - rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); - ssp->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); + rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_sup->srcu_n_exp_nodelay = 0; + spin_unlock_irq_rcu_node(ssp->srcu_sup); } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, @@ -1627,10 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = 1 ^ (ssp->srcu_idx & 1); if (!try_check_zero(ssp, idx, 2)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } - ssp->srcu_n_exp_nodelay = 0; + ssp->srcu_sup->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1656,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1684,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1700,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) - queue_delayed_work(rcu_gp_wq, &ssp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); } /* @@ -1724,22 +1761,24 @@ static void process_srcu(struct work_struct *work) unsigned long curdelay; unsigned long j; struct srcu_struct *ssp; + struct srcu_usage *sup; - ssp = container_of(work, struct srcu_struct, work.work); + sup = container_of(work, struct srcu_usage, work.work); + ssp = sup->srcu_ssp; srcu_advance_state(ssp); curdelay = srcu_get_delay(ssp); if (curdelay) { - WRITE_ONCE(ssp->reschedule_count, 0); + WRITE_ONCE(sup->reschedule_count, 0); } else { j = jiffies; - if (READ_ONCE(ssp->reschedule_jiffies) == j) { - WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) + if (READ_ONCE(sup->reschedule_jiffies) == j) { + WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); + if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { - WRITE_ONCE(ssp->reschedule_count, 1); - WRITE_ONCE(ssp->reschedule_jiffies, j); + WRITE_ONCE(sup->reschedule_count, 1); + WRITE_ONCE(sup->reschedule_jiffies, j); } } srcu_reschedule(ssp, curdelay); @@ -1752,7 +1791,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); @@ -1774,14 +1813,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int cpu; int idx; unsigned long s0 = 0, s1 = 0; - int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state, srcu_size_state_name[ss_state_idx]); if (!ssp->sda) { // Called after cleanup_srcu_struct(), perhaps. @@ -1838,7 +1877,7 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *ssp; + struct srcu_usage *sup; /* Decide on srcu_struct-size strategy. */ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { @@ -1858,12 +1897,13 @@ void __init srcu_init(void) */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, + sup = list_first_entry(&srcu_boot_list, struct srcu_usage, work.work.entry); - list_del_init(&ssp->work.work.entry); - if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) - ssp->srcu_size_state = SRCU_SIZE_ALLOC; - queue_work(rcu_gp_wq, &ssp->work.work); + list_del_init(&sup->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && + sup->srcu_size_state == SRCU_SIZE_SMALL) + sup->srcu_size_state = SRCU_SIZE_ALLOC; + queue_work(rcu_gp_wq, &sup->work.work); } } @@ -1873,13 +1913,14 @@ void __init srcu_init(void) static int srcu_module_coming(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - int ret; for (i = 0; i < mod->num_srcu_structs; i++) { - ret = init_srcu_struct(*(sspp++)); - if (WARN_ON_ONCE(ret)) - return ret; + ssp = *(sspp++); + ssp->sda = alloc_percpu(struct srcu_data); + if (WARN_ON_ONCE(!ssp->sda)) + return -ENOMEM; } return 0; } @@ -1888,10 +1929,17 @@ static int srcu_module_coming(struct module *mod) static void srcu_module_going(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - for (i = 0; i < mod->num_srcu_structs; i++) - cleanup_srcu_struct(*(sspp++)); + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && + !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) + cleanup_srcu_struct(ssp); + if (!WARN_ON(srcu_readers_active(ssp))) + free_percpu(ssp->sda); + } } /* Handle one module, either coming or going. */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bfb5e1549f2b..5f4fc8184dd0 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -136,8 +136,16 @@ static struct rcu_tasks rt_name = \ .kname = #rt_name, \ } +#ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +#endif + +#ifdef CONFIG_TASKS_RCU +/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); +static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); +#endif /* Avoid IPIing CPUs early in the grace period. */ #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) @@ -830,6 +838,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int rtsi = READ_ONCE(rcu_task_stall_info); + + if (!IS_ENABLED(CONFIG_TINY_RCU)) { + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); + } + /* * Exiting tasks may escape the tasklist scan. Those are vulnerable * until their final schedule() with TASK_DEAD state. To cope with @@ -848,6 +863,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * call to synchronize_rcu(). */ synchronize_srcu(&tasks_rcu_exit_srcu); + + if (!IS_ENABLED(CONFIG_TINY_RCU)) + del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ @@ -923,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) +{ +#ifndef CONFIG_TINY_RCU + int rtsi; + + rtsi = READ_ONCE(rcu_task_stall_info); + pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n", + __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq, + tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies); + pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n"); + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); +#endif // #ifndef CONFIG_TINY_RCU +} + /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @rhp: structure to be used for queueing the RCU updates. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7b95ee98a1a5..f52ff7241041 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void) } raw_spin_unlock_rcu_node(rdp->mynode); } +NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); #endif /* CONFIG_NO_HZ_FULL */ /* @@ -1955,7 +1956,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; bool needacc = false; struct rcu_node *rnp; @@ -1987,7 +1987,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * NOCB kthreads have their own way to deal with that... */ if (!rcu_rdp_is_offloaded(rdp)) { - needwake = rcu_accelerate_cbs(rnp, rdp); + /* + * The current GP has not yet ended, so it + * should not be possible for rcu_accelerate_cbs() + * to return true. So complain, but don't awaken. + */ + WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { /* * ...but NOCB kthreads may miss or delay callbacks acceleration @@ -1999,8 +2004,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - if (needwake) - rcu_gp_kthread_wake(); if (needacc) { rcu_nocb_lock_irqsave(rdp, flags); @@ -2131,6 +2134,8 @@ static void rcu_do_batch(struct rcu_data *rdp) break; } } else { + // In rcuoc context, so no worries about depriving + // other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -4951,9 +4956,8 @@ void __init rcu_init(void) else qovld_calc = qovld; - // Kick-start any polled grace periods that started early. - if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) - (void)start_poll_synchronize_rcu_expedited(); + // Kick-start in case any polled grace periods started early. + (void)start_poll_synchronize_rcu_expedited(); rcu_test_sync_prims(); } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 249c2967d9e6..3b7abb58157d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); + unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_exp_jiffies_till_stall_check(); @@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void) if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); mask = READ_ONCE(rnp->expmask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - preempt_disable(); if (cpu_online(cpu)) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); - preempt_enable(); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } j = READ_ONCE(jiffies_till_first_fqs); if (synchronize_rcu_expedited_wait_once(j + HZ)) @@ -802,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) int ndetected = 0; struct task_struct *t; - if (!READ_ONCE(rnp->exp_tasks)) - return 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->exp_tasks) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; + } t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { @@ -1065,9 +1068,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void) if (rcu_init_invoked()) raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); if (!poll_state_synchronize_rcu(s)) { - rnp->exp_seq_poll_rq = s; - if (rcu_init_invoked()) + if (rcu_init_invoked()) { + rnp->exp_seq_poll_rq = s; queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } } if (rcu_init_invoked()) raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 9e1c8caec5ce..f2280616f9d5 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1312,6 +1312,7 @@ int rcu_nocb_cpu_offload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); +#ifdef CONFIG_RCU_LAZY static unsigned long lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -1360,6 +1361,7 @@ static struct shrinker lazy_rcu_shrinker = { .batch = 0, .seeks = DEFAULT_SEEKS, }; +#endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) { @@ -1391,8 +1393,10 @@ void __init rcu_init_nohz(void) if (!rcu_state.nocb_is_setup) return; +#ifdef CONFIG_RCU_LAZY if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) pr_err("Failed to register lazy_rcu shrinker!\n"); +#endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); diff --git a/kernel/signal.c b/kernel/signal.c index 8cb28f1df294..8f6330f0e9ca 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) /* * Now find a thread we can wake up to take the signal off the queue. * - * If the main thread wants the signal, it gets first crack. - * Probably the least surprising to the average bear. + * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; @@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) ret = -1; rcu_read_lock(); + + /* + * This function is used by POSIX timers to deliver a timer signal. + * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID + * set), the signal must be delivered to the specific thread (queues + * into t->pending). + * + * Where type is not PIDTYPE_PID, signals must be delivered to the + * process. In this case, prefer to deliver to current if it is in + * the same thread group as the target process, which avoids + * unnecessarily waking up a potentially idle task. + */ t = pid_task(pid, type); - if (!t || !likely(lock_task_sighand(t, &flags))) + if (!t) + goto ret; + if (type != PIDTYPE_PID && same_thread_group(t, current)) + t = current; + if (!likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ diff --git a/kernel/softirq.c b/kernel/softirq.c index c8a6913c067d..1b725510dd0f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -793,10 +793,15 @@ static void tasklet_action_common(struct softirq_action *a, if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { if (tasklet_clear_sched(t)) { - if (t->use_callback) + if (t->use_callback) { + trace_tasklet_entry(t, t->callback); t->callback(t); - else + trace_tasklet_exit(t, t->callback); + } else { + trace_tasklet_entry(t, t->func); t->func(t->data); + trace_tasklet_exit(t, t->func); + } } tasklet_unlock(t); continue; diff --git a/kernel/sys.c b/kernel/sys.c index 495cd87d9bf4..351de7916302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) struct cred *new; int retval; kuid_t kruid, keuid, ksuid; + bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); @@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && + (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && + uid_eq(keuid, old->fsuid))) && + (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) + return 0; + + ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); + euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); + suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); + if ((ruid_new || euid_new || suid_new) && + !ns_capable_setid(old->user_ns, CAP_SETUID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { - if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && - !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) - goto error; - if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && - !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) - goto error; - if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && - !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) - goto error; - } - if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { @@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) struct cred *new; int retval; kgid_t krgid, kegid, ksgid; + bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); @@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && + (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && + gid_eq(kegid, old->fsgid))) && + (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) + return 0; + + rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); + egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); + sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); + if ((rgid_new || egid_new || sgid_new) && + !ns_capable_setid(old->user_ns, CAP_SETGID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { - if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && - !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) - goto error; - if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && - !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) - goto error; - if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && - !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) - goto error; - } if (rgid != (gid_t) -1) new->gid = krgid; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2f5e9b34022c..e9c6f9d0e42c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head, return expires; ctmr->firing = 1; + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(ctmr->handling, current); cpu_timer_dequeue(ctmr); list_add_tail(&ctmr->elist, firing); } @@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk); #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK static void posix_cpu_timers_work(struct callback_head *work) { + struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work); + + mutex_lock(&cw->mutex); handle_posix_cpu_timers(current); + mutex_unlock(&cw->mutex); +} + +/* + * Invoked from the posix-timer core when a cancel operation failed because + * the timer is marked firing. The caller holds rcu_read_lock(), which + * protects the timer and the task which is expiring it from being freed. + */ +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling); + + /* Has the handling task completed expiry already? */ + if (!tsk) + return; + + /* Ensure that the task cannot go away */ + get_task_struct(tsk); + /* Now drop the RCU protection so the mutex can be locked */ + rcu_read_unlock(); + /* Wait on the expiry mutex */ + mutex_lock(&tsk->posix_cputimers_work.mutex); + /* Release it immediately again. */ + mutex_unlock(&tsk->posix_cputimers_work.mutex); + /* Drop the task reference. */ + put_task_struct(tsk); + /* Relock RCU so the callsite is balanced */ + rcu_read_lock(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + /* Ensure that timr->it.cpu.handling task cannot go away */ + rcu_read_lock(); + spin_unlock_irq(&timr->it_lock); + posix_cpu_timer_wait_running(timr); + rcu_read_unlock(); + /* @timr is on stack and is valid */ + spin_lock_irq(&timr->it_lock); } /* @@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p) sizeof(p->posix_cputimers_work.work)); init_task_work(&p->posix_cputimers_work.work, posix_cpu_timers_work); + mutex_init(&p->posix_cputimers_work.mutex); p->posix_cputimers_work.scheduled = false; } @@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk) lockdep_posixtimer_exit(); } +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + cpu_relax(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + spin_unlock_irq(&timr->it_lock); + cpu_relax(); + spin_lock_irq(&timr->it_lock); +} + static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) { return false; @@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) */ if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(timer->it.cpu.handling, NULL); spin_unlock(&timer->it_lock); } } @@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, expires = cpu_timer_getexpires(&timer.it.cpu); error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { - /* - * Timer is now unarmed, deletion can not fail. - */ + /* Timer is now unarmed, deletion can not fail. */ posix_cpu_timer_del(&timer); + } else { + while (error == TIMER_RETRY) { + posix_cpu_timer_wait_running_nsleep(&timer); + error = posix_cpu_timer_del(&timer); + } } - spin_unlock_irq(&timer.it_lock); - while (error == TIMER_RETRY) { - /* - * We need to handle case when timer was or is in the - * middle of firing. In other cases we already freed - * resources. - */ - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_del(&timer); - spin_unlock_irq(&timer.it_lock); - } + spin_unlock_irq(&timer.it_lock); if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* @@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = { .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, .timer_rearm = posix_cpu_timer_rearm, + .timer_wait_running = posix_cpu_timer_wait_running, }; const struct k_clock clock_process = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0c8a87a11b39..808a247205a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, rcu_read_lock(); unlock_timer(timer, *flags); + /* + * kc->timer_wait_running() might drop RCU lock. So @timer + * cannot be touched anymore after the function returns! + */ if (!WARN_ON_ONCE(!kc->timer_wait_running)) kc->timer_wait_running(timer); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 46789356f856..65b8658da829 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + ktime_t next_p; + u32 rem; + tick_do_timer_cpu = cpu; - tick_next_period = ktime_get(); + next_p = ktime_get(); + div_u64_rem(next_p, TICK_NSEC, &rem); + if (rem) { + next_p -= rem; + next_p += TICK_NSEC; + } + + tick_next_period = next_p; #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b0e3c9205946..52254679ec48 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU_EXP) { + trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); + return true; + } + return false; } @@ -527,7 +532,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) tick_nohz_full_running = true; } -static int tick_nohz_cpu_down(unsigned int cpu) +bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The tick_do_timer_cpu CPU handles housekeeping duty (unbound @@ -535,8 +540,13 @@ static int tick_nohz_cpu_down(unsigned int cpu) * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return -EBUSY; - return 0; + return false; + return true; +} + +static int tick_nohz_cpu_down(unsigned int cpu) +{ + return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) @@ -637,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog_sched(); } -/* - * Updates the per-CPU time idle statistics counters - */ -static void -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - if (nr_iowait_cpu(cpu) > 0) - ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); - else - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_entrytime = now; - } + if (WARN_ON_ONCE(!ts->idle_active)) + return; - if (last_update_time) - *last_update_time = ktime_to_us(now); + delta = ktime_sub(now, ts->idle_entrytime); -} + write_seqcount_begin(&ts->idle_sleeptime_seq); + if (nr_iowait_cpu(smp_processor_id()) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); -static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) -{ - update_ts_time_stats(smp_processor_id(), ts, now, NULL); + ts->idle_entrytime = now; ts->idle_active = 0; + write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { + write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); ts->idle_active = 1; + write_seqcount_end(&ts->idle_sleeptime_seq); + sched_clock_idle_sleep_event(); } +static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, + bool compute_delta, u64 *last_update_time) +{ + ktime_t now, idle; + unsigned int seq; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) + *last_update_time = ktime_to_us(now); + + do { + seq = read_seqcount_begin(&ts->idle_sleeptime_seq); + + if (ts->idle_active && compute_delta) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(*sleeptime, delta); + } else { + idle = *sleeptime; + } + } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); + + return ktime_to_us(idle); + +} + /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query @@ -681,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts) * counters if NULL. * * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -691,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, idle; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - idle = ts->idle_sleeptime; - } else { - if (ts->idle_active && !nr_iowait_cpu(cpu)) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - idle = ktime_add(ts->idle_sleeptime, delta); - } else { - idle = ts->idle_sleeptime; - } - } - - return ktime_to_us(idle); + return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, + !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); @@ -722,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * counters if NULL. * * Return the cumulative iowait time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -732,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, iowait; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - iowait = ts->iowait_sleeptime; - } else { - if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - iowait = ktime_add(ts->iowait_sleeptime, delta); - } else { - iowait = ts->iowait_sleeptime; - } - } - - return ktime_to_us(iowait); + return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, + nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); @@ -1084,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) +/** + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + */ +void tick_nohz_idle_stop_tick(void) { - ktime_t expires; + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); + ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the @@ -1119,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) } } -/** - * tick_nohz_idle_stop_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - */ -void tick_nohz_idle_stop_tick(void) -{ - __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); -} - void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 504649513399..5ed5a9d41d5a 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -22,65 +22,82 @@ enum tick_nohz_mode { /** * struct tick_sched - sched tick emulation and no idle tick control/stats - * @sched_timer: hrtimer to schedule the periodic tick in high - * resolution mode - * @check_clocks: Notification mechanism about clocksource changes - * @nohz_mode: Mode - one state of tick_nohz_mode + * * @inidle: Indicator that the CPU is in the tick idle mode * @tick_stopped: Indicator that the idle tick has been stopped * @idle_active: Indicator that the CPU is actively in the tick idle mode; * it is reset during irq handling phases. - * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @do_timer_last: CPU was the last one doing do_timer before going idle * @got_idle_tick: Tick timer function has run with @inidle set + * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @last_tick_jiffies: Value of jiffies seen on last tick + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_waketime: Time when the idle was interrupted + * @idle_entrytime: Time when the idle call was entered + * @nohz_mode: Mode - one state of tick_nohz_mode + * @last_jiffies: Base jiffies snapshot when next event was last computed + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @idle_expires: Next tick in idle, for debugging purpose only * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped - * @idle_entrytime: Time when the idle call was entered - * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) - * @timer_expires_base: Base time clock monotonic for @timer_expires - * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick - * @last_tick_jiffies: Value of jiffies seen on last tick - * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @check_clocks: Notification mechanism about clocksource changes */ struct tick_sched { - struct hrtimer sched_timer; - unsigned long check_clocks; - enum tick_nohz_mode nohz_mode; - + /* Common flags */ unsigned int inidle : 1; unsigned int tick_stopped : 1; unsigned int idle_active : 1; unsigned int do_timer_last : 1; unsigned int got_idle_tick : 1; + /* Tick handling: jiffies stall check */ + unsigned int stalled_jiffies; + unsigned long last_tick_jiffies; + + /* Tick handling */ + struct hrtimer sched_timer; ktime_t last_tick; ktime_t next_tick; unsigned long idle_jiffies; - unsigned long idle_calls; - unsigned long idle_sleeps; - ktime_t idle_entrytime; ktime_t idle_waketime; - ktime_t idle_exittime; - ktime_t idle_sleeptime; - ktime_t iowait_sleeptime; + + /* Idle entry */ + seqcount_t idle_sleeptime_seq; + ktime_t idle_entrytime; + + /* Tick stop */ + enum tick_nohz_mode nohz_mode; unsigned long last_jiffies; - u64 timer_expires; u64 timer_expires_base; + u64 timer_expires; u64 next_timer; ktime_t idle_expires; + unsigned long idle_calls; + unsigned long idle_sleeps; + + /* Idle exit */ + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + + /* Full dynticks handling */ atomic_t tick_dep_mask; - unsigned long last_tick_jiffies; - unsigned int stalled_jiffies; + + /* Clocksource changes */ + unsigned long check_clocks; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a856d4a34c67..5b1e7fa41ca8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -257,7 +257,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE_WITH_REGS + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config DYNAMIC_FTRACE_WITH_CALL_OPS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c67bcc89a771..3b46dba3f69b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2583,28 +2583,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr, static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - unsigned long addr; + unsigned long addr = READ_ONCE(ops->direct_call); - addr = ftrace_find_rec_direct(ip); if (!addr) return; arch_ftrace_set_direct_caller(fregs, addr); } - -static struct ftrace_ops direct_ops = { - .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS - | FTRACE_OPS_FL_PERMANENT, - /* - * By declaring the main trampoline as this trampoline - * it will never have one allocated for it. Allocated - * trampolines should not call direct functions. - * The direct_ops should only be called by the builtin - * ftrace_regs_caller trampoline. - */ - .trampoline = FTRACE_REGS_ADDR, -}; #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** @@ -5301,391 +5286,9 @@ struct ftrace_direct_func { static LIST_HEAD(ftrace_direct_funcs); -/** - * ftrace_find_direct_func - test an address if it is a registered direct caller - * @addr: The address of a registered direct caller - * - * This searches to see if a ftrace direct caller has been registered - * at a specific address, and if so, it returns a descriptor for it. - * - * This can be used by architecture code to see if an address is - * a direct caller (trampoline) attached to a fentry/mcount location. - * This is useful for the function_graph tracer, as it may need to - * do adjustments if it traced a location that also has a direct - * trampoline attached to it. - */ -struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) -{ - struct ftrace_direct_func *entry; - bool found = false; - - /* May be called by fgraph trampoline (protected by rcu tasks) */ - list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { - if (entry->addr == addr) { - found = true; - break; - } - } - if (found) - return entry; - - return NULL; -} - -static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) -{ - struct ftrace_direct_func *direct; - - direct = kmalloc(sizeof(*direct), GFP_KERNEL); - if (!direct) - return NULL; - direct->addr = addr; - direct->count = 0; - list_add_rcu(&direct->next, &ftrace_direct_funcs); - ftrace_direct_func_count++; - return direct; -} - static int register_ftrace_function_nolock(struct ftrace_ops *ops); -/** - * register_ftrace_direct - Call a custom trampoline directly - * @ip: The address of the nop at the beginning of a function - * @addr: The address of the trampoline to call at @ip - * - * This is used to connect a direct call from the nop location (@ip) - * at the start of ftrace traced functions. The location that it calls - * (@addr) must be able to handle a direct call, and save the parameters - * of the function being traced, and restore them (or inject new ones - * if needed), before returning. - * - * Returns: - * 0 on success - * -EBUSY - Another direct function is already attached (there can be only one) - * -ENODEV - @ip does not point to a ftrace nop location (or not supported) - * -ENOMEM - There was an allocation failure. - */ -int register_ftrace_direct(unsigned long ip, unsigned long addr) -{ - struct ftrace_direct_func *direct; - struct ftrace_func_entry *entry; - struct ftrace_hash *free_hash = NULL; - struct dyn_ftrace *rec; - int ret = -ENODEV; - - mutex_lock(&direct_mutex); - - ip = ftrace_location(ip); - if (!ip) - goto out_unlock; - - /* See if there's a direct function at @ip already */ - ret = -EBUSY; - if (ftrace_find_rec_direct(ip)) - goto out_unlock; - - ret = -ENODEV; - rec = lookup_rec(ip, ip); - if (!rec) - goto out_unlock; - - /* - * Check if the rec says it has a direct call but we didn't - * find one earlier? - */ - if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) - goto out_unlock; - - /* Make sure the ip points to the exact record */ - if (ip != rec->ip) { - ip = rec->ip; - /* Need to check this ip for a direct. */ - if (ftrace_find_rec_direct(ip)) - goto out_unlock; - } - - ret = -ENOMEM; - direct = ftrace_find_direct_func(addr); - if (!direct) { - direct = ftrace_alloc_direct_func(addr); - if (!direct) - goto out_unlock; - } - - entry = ftrace_add_rec_direct(ip, addr, &free_hash); - if (!entry) - goto out_unlock; - - ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - - if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function_nolock(&direct_ops); - if (ret) - ftrace_set_filter_ip(&direct_ops, ip, 1, 0); - } - - if (ret) { - remove_hash_entry(direct_functions, entry); - kfree(entry); - if (!direct->count) { - list_del_rcu(&direct->next); - synchronize_rcu_tasks(); - kfree(direct); - if (free_hash) - free_ftrace_hash(free_hash); - free_hash = NULL; - ftrace_direct_func_count--; - } - } else { - direct->count++; - } - out_unlock: - mutex_unlock(&direct_mutex); - - if (free_hash) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } - - return ret; -} -EXPORT_SYMBOL_GPL(register_ftrace_direct); - -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, - struct dyn_ftrace **recp) -{ - struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; - - rec = lookup_rec(*ip, *ip); - if (!rec) - return NULL; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - return NULL; - } - - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - - /* Passed in ip just needs to be on the call site */ - *ip = rec->ip; - - if (recp) - *recp = rec; - - return entry; -} - -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) -{ - struct ftrace_direct_func *direct; - struct ftrace_func_entry *entry; - struct ftrace_hash *hash; - int ret = -ENODEV; - - mutex_lock(&direct_mutex); - - ip = ftrace_location(ip); - if (!ip) - goto out_unlock; - - entry = find_direct_entry(&ip, NULL); - if (!entry) - goto out_unlock; - - hash = direct_ops.func_hash->filter_hash; - if (hash->count == 1) - unregister_ftrace_function(&direct_ops); - - ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); - - WARN_ON(ret); - - remove_hash_entry(direct_functions, entry); - - direct = ftrace_find_direct_func(addr); - if (!WARN_ON(!direct)) { - /* This is the good path (see the ! before WARN) */ - direct->count--; - WARN_ON(direct->count < 0); - if (!direct->count) { - list_del_rcu(&direct->next); - synchronize_rcu_tasks(); - kfree(direct); - kfree(entry); - ftrace_direct_func_count--; - } - } - out_unlock: - mutex_unlock(&direct_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_direct); - -static struct ftrace_ops stub_ops = { - .func = ftrace_stub, -}; - -/** - * ftrace_modify_direct_caller - modify ftrace nop directly - * @entry: The ftrace hash entry of the direct helper for @rec - * @rec: The record representing the function site to patch - * @old_addr: The location that the site at @rec->ip currently calls - * @new_addr: The location that the site at @rec->ip should call - * - * An architecture may overwrite this function to optimize the - * changing of the direct callback on an ftrace nop location. - * This is called with the ftrace_lock mutex held, and no other - * ftrace callbacks are on the associated record (@rec). Thus, - * it is safe to modify the ftrace record, where it should be - * currently calling @old_addr directly, to call @new_addr. - * - * This is called with direct_mutex locked. - * - * Safety checks should be made to make sure that the code at - * @rec->ip is currently calling @old_addr. And this must - * also update entry->direct to @new_addr. - */ -int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, - struct dyn_ftrace *rec, - unsigned long old_addr, - unsigned long new_addr) -{ - unsigned long ip = rec->ip; - int ret; - - lockdep_assert_held(&direct_mutex); - - /* - * The ftrace_lock was used to determine if the record - * had more than one registered user to it. If it did, - * we needed to prevent that from changing to do the quick - * switch. But if it did not (only a direct caller was attached) - * then this function is called. But this function can deal - * with attached callers to the rec that we care about, and - * since this function uses standard ftrace calls that take - * the ftrace_lock mutex, we need to release it. - */ - mutex_unlock(&ftrace_lock); - - /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. - */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_lock; - - ret = register_ftrace_function_nolock(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_lock; - } - - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - out_lock: - mutex_lock(&ftrace_lock); - - return ret; -} - -/** - * modify_ftrace_direct - Modify an existing direct call to call something else - * @ip: The instruction pointer to modify - * @old_addr: The address that the current @ip calls directly - * @new_addr: The address that the @ip should call - * - * This modifies a ftrace direct caller at an instruction pointer without - * having to disable it first. The direct call will switch over to the - * @new_addr without missing anything. - * - * Returns: zero on success. Non zero on error, which includes: - * -ENODEV : the @ip given has no direct caller attached - * -EINVAL : the @old_addr does not match the current direct caller - */ -int modify_ftrace_direct(unsigned long ip, - unsigned long old_addr, unsigned long new_addr) -{ - struct ftrace_direct_func *direct, *new_direct = NULL; - struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; - int ret = -ENODEV; - - mutex_lock(&direct_mutex); - - mutex_lock(&ftrace_lock); - - ip = ftrace_location(ip); - if (!ip) - goto out_unlock; - - entry = find_direct_entry(&ip, &rec); - if (!entry) - goto out_unlock; - - ret = -EINVAL; - if (entry->direct != old_addr) - goto out_unlock; - - direct = ftrace_find_direct_func(old_addr); - if (WARN_ON(!direct)) - goto out_unlock; - if (direct->count > 1) { - ret = -ENOMEM; - new_direct = ftrace_alloc_direct_func(new_addr); - if (!new_direct) - goto out_unlock; - direct->count--; - new_direct->count++; - } else { - direct->addr = new_addr; - } - - /* - * If there's no other ftrace callback on the rec->ip location, - * then it can be changed directly by the architecture. - * If there is another caller, then we just need to change the - * direct caller helper to point to @new_addr. - */ - if (ftrace_rec_count(rec) == 1) { - ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); - } else { - entry->direct = new_addr; - ret = 0; - } - - if (ret) { - direct->addr = old_addr; - if (unlikely(new_direct)) { - direct->count++; - list_del_rcu(&new_direct->next); - synchronize_rcu_tasks(); - kfree(new_direct); - ftrace_direct_func_count--; - } - } - - out_unlock: - mutex_unlock(&ftrace_lock); - mutex_unlock(&direct_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(modify_ftrace_direct); - -#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS) static int check_direct_multi(struct ftrace_ops *ops) { @@ -5714,7 +5317,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long } /** - * register_ftrace_direct_multi - Call a custom trampoline directly + * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops * @ops: The address of the struct ftrace_ops object * @addr: The address of the trampoline to call at @ops functions @@ -5735,7 +5338,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long * -ENODEV - @ip does not point to a ftrace nop location (or not supported) * -ENOMEM - There was an allocation failure. */ -int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { struct ftrace_hash *hash, *free_hash = NULL; struct ftrace_func_entry *entry, *new; @@ -5777,6 +5380,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ops->func = call_direct_funcs; ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; + ops->direct_call = addr; err = register_ftrace_function_nolock(ops); @@ -5793,11 +5397,11 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } return err; } -EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); +EXPORT_SYMBOL_GPL(register_ftrace_direct); /** - * unregister_ftrace_direct_multi - Remove calls to custom trampoline - * previously registered by register_ftrace_direct_multi for @ops object. + * unregister_ftrace_direct - Remove calls to custom trampoline + * previously registered by register_ftrace_direct for @ops object. * @ops: The address of the struct ftrace_ops object * * This is used to remove a direct calls to @addr from the nop locations @@ -5808,7 +5412,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); * 0 on success * -EINVAL - The @ops object was not properly registered. */ -int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, + bool free_filters) { struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; @@ -5826,12 +5431,15 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) /* cleanup for possible another register call */ ops->func = NULL; ops->trampoline = 0; + + if (free_filters) + ftrace_free_filter(ops); return err; } -EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int -__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; @@ -5847,6 +5455,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; + tmp_ops.direct_call = addr; err = register_ftrace_function_nolock(&tmp_ops); if (err) @@ -5868,6 +5477,8 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) entry->direct = addr; } } + /* Prevent store tearing if a trampoline concurrently accesses the value */ + WRITE_ONCE(ops->direct_call, addr); mutex_unlock(&ftrace_lock); @@ -5878,7 +5489,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } /** - * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call + * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call * to call something else * @ops: The address of the struct ftrace_ops object * @addr: The address of the new trampoline to call at @ops functions @@ -5895,19 +5506,19 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) * Returns: zero on success. Non zero on error, which includes: * -EINVAL - The @ops object was not properly registered. */ -int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr) { if (check_direct_multi(ops)) return -EINVAL; if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EINVAL; - return __modify_ftrace_direct_multi(ops, addr); + return __modify_ftrace_direct(ops, addr); } -EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock); +EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock); /** - * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * modify_ftrace_direct - Modify an existing direct 'multi' call * to call something else * @ops: The address of the struct ftrace_ops object * @addr: The address of the new trampoline to call at @ops functions @@ -5921,7 +5532,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock); * Returns: zero on success. Non zero on error, which includes: * -EINVAL - The @ops object was not properly registered. */ -int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { int err; @@ -5931,11 +5542,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return -EINVAL; mutex_lock(&direct_mutex); - err = __modify_ftrace_direct_multi(ops, addr); + err = __modify_ftrace_direct(ops, addr); mutex_unlock(&direct_mutex); return err; } -EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4496975f2029..efbbec2caff8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - kvfree_rcu(inst); + kvfree_rcu_mightsleep(inst); } /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 20d0c4a97633..2d2616678295 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - kvfree_rcu(link); + kvfree_rcu_mightsleep(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ff0536cea968..a931d9aaea26 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -785,14 +785,7 @@ static struct fgraph_ops fgraph_ops __initdata = { }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -#ifndef CALL_DEPTH_ACCOUNT -#define CALL_DEPTH_ACCOUNT "" -#endif - -noinline __noclone static void trace_direct_tramp(void) -{ - asm(CALL_DEPTH_ACCOUNT); -} +static struct ftrace_ops direct; #endif /* @@ -870,8 +863,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Register direct function together with graph tracer * and make sure we get graph trace. */ - ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, - (unsigned long) trace_direct_tramp); + ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0); + ret = register_ftrace_direct(&direct, + (unsigned long)ftrace_stub_direct_tramp); if (ret) goto out; @@ -891,8 +885,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, unregister_ftrace_graph(&fgraph_ops); - ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, - (unsigned long) trace_direct_tramp); + ret = unregister_ftrace_direct(&direct, + (unsigned long)ftrace_stub_direct_tramp, + true); if (ret) goto out; diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c new file mode 100644 index 000000000000..b7cbd66f889e --- /dev/null +++ b/kernel/vhost_task.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Oracle Corporation + */ +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/sched/task.h> +#include <linux/sched/vhost_task.h> +#include <linux/sched/signal.h> + +enum vhost_task_flags { + VHOST_TASK_FLAGS_STOP, +}; + +static int vhost_task_fn(void *data) +{ + struct vhost_task *vtsk = data; + int ret; + + ret = vtsk->fn(vtsk->data); + complete(&vtsk->exited); + do_exit(ret); +} + +/** + * vhost_task_stop - stop a vhost_task + * @vtsk: vhost_task to stop + * + * Callers must call vhost_task_should_stop and return from their worker + * function when it returns true; + */ +void vhost_task_stop(struct vhost_task *vtsk) +{ + pid_t pid = vtsk->task->pid; + + set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); + wake_up_process(vtsk->task); + /* + * Make sure vhost_task_fn is no longer accessing the vhost_task before + * freeing it below. If userspace crashed or exited without closing, + * then the vhost_task->task could already be marked dead so + * kernel_wait will return early. + */ + wait_for_completion(&vtsk->exited); + /* + * If we are just closing/removing a device and the parent process is + * not exiting then reap the task. + */ + kernel_wait4(pid, NULL, __WCLONE, NULL); + kfree(vtsk); +} +EXPORT_SYMBOL_GPL(vhost_task_stop); + +/** + * vhost_task_should_stop - should the vhost task return from the work function + * @vtsk: vhost_task to stop + */ +bool vhost_task_should_stop(struct vhost_task *vtsk) +{ + return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); +} +EXPORT_SYMBOL_GPL(vhost_task_should_stop); + +/** + * vhost_task_create - create a copy of a process to be used by the kernel + * @fn: thread stack + * @arg: data to be passed to fn + * @name: the thread's name + * + * This returns a specialized task for use by the vhost layer or NULL on + * failure. The returned task is inactive, and the caller must fire it up + * through vhost_task_start(). + */ +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, + const char *name) +{ + struct kernel_clone_args args = { + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .exit_signal = 0, + .fn = vhost_task_fn, + .name = name, + .user_worker = 1, + .no_files = 1, + .ignore_signals = 1, + }; + struct vhost_task *vtsk; + struct task_struct *tsk; + + vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); + if (!vtsk) + return NULL; + init_completion(&vtsk->exited); + vtsk->data = arg; + vtsk->fn = fn; + + args.fn_arg = vtsk; + + tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); + if (IS_ERR(tsk)) { + kfree(vtsk); + return NULL; + } + + vtsk->task = tsk; + return vtsk; +} +EXPORT_SYMBOL_GPL(vhost_task_create); + +/** + * vhost_task_start - start a vhost_task created with vhost_task_create + * @vtsk: vhost_task to wake up + */ +void vhost_task_start(struct vhost_task *vtsk) +{ + wake_up_new_task(vtsk->task); +} +EXPORT_SYMBOL_GPL(vhost_task_start); |