about | summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 1
-rw-r--r-- kernel/bpf/bpf_inode_storage.c 38
-rw-r--r-- kernel/bpf/trampoline.c 12
-rw-r--r-- kernel/bpf/verifier.c 15
-rw-r--r-- kernel/cgroup/cgroup.c 10
-rw-r--r-- kernel/configs/tiny.config 1
-rw-r--r-- kernel/entry/syscall_user_dispatch.c 74
-rw-r--r-- kernel/fork.c 124
-rw-r--r-- kernel/irq/manage.c 5
-rw-r--r-- kernel/kcsan/core.c 17
-rw-r--r-- kernel/kthread.c 33
-rw-r--r-- kernel/livepatch/core.c 6
-rw-r--r-- kernel/locking/lockdep.c 64
-rw-r--r-- kernel/locking/locktorture.c 188
-rw-r--r-- kernel/locking/test-ww_mutex.c 2
-rw-r--r-- kernel/module/livepatch.c 10
-rw-r--r-- kernel/nsproxy.c 17
-rw-r--r-- kernel/pid.c 19
-rw-r--r-- kernel/printk/printk.c 13
-rw-r--r-- kernel/ptrace.c 9
-rw-r--r-- kernel/rcu/Kconfig 3
-rw-r--r-- kernel/rcu/rcu.h 43
-rw-r--r-- kernel/rcu/rcuscale.c 9
-rw-r--r-- kernel/rcu/rcutorture.c 234
-rw-r--r-- kernel/rcu/refscale.c 2
-rw-r--r-- kernel/rcu/srcutiny.c 2
-rw-r--r-- kernel/rcu/srcutree.c 438
-rw-r--r-- kernel/rcu/tasks.h 33
-rw-r--r-- kernel/rcu/tree.c 18
-rw-r--r-- kernel/rcu/tree_exp.h 16
-rw-r--r-- kernel/rcu/tree_nocb.h 4
-rw-r--r-- kernel/signal.c 21
-rw-r--r-- kernel/softirq.c 9
-rw-r--r-- kernel/sys.c 69
-rw-r--r-- kernel/time/posix-cpu-timers.c 81
-rw-r--r-- kernel/time/posix-timers.c 4
-rw-r--r-- kernel/time/tick-common.c 12
-rw-r--r-- kernel/time/tick-sched.c 151
-rw-r--r-- kernel/time/tick-sched.h 67
-rw-r--r-- kernel/trace/Kconfig 2
-rw-r--r-- kernel/trace/ftrace.c 441
-rw-r--r-- kernel/trace/trace_osnoise.c 2
-rw-r--r-- kernel/trace/trace_probe.c 2
-rw-r--r-- kernel/trace/trace_selftest.c 19
-rw-r--r-- kernel/vhost_task.c 117
45 files changed, 1498 insertions, 959 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 10ef068f598d..6fc72b3afbde 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 05f4c66c9089..85720311cc67 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -84,16 +84,13 @@ void bpf_inode_storage_free(struct inode *inode)
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(f->f_inode, map, true);
- fput(f);
+ sdata = inode_storage_lookup(file_inode(f.file), map, true);
+ fdput(f);
return sdata ? sdata->data : NULL;
}
@@ -101,22 +98,19 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- if (!inode_storage_ptr(f->f_inode)) {
- fput(f);
+ if (!inode_storage_ptr(file_inode(f.file))) {
+ fdput(f);
return -EBADF;
}
- sdata = bpf_local_storage_update(f->f_inode,
+ sdata = bpf_local_storage_update(file_inode(f.file),
(struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
- fput(f);
+ fdput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -135,16 +129,14 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct file *f;
- int fd, err;
+ struct fd f = fdget_raw(*(int *)key);
+ int err;
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- err = inode_storage_delete(f->f_inode, map);
- fput(f);
+ err = inode_storage_delete(file_inode(f.file), map);
+ fdput(f);
return err;
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..a14d0af534b3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -45,8 +45,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd
lockdep_assert_held_once(&tr->mutex);
/* Instead of updating the trampoline here, we propagate
- * -EAGAIN to register_ftrace_direct_multi(). Then we can
- * retry register_ftrace_direct_multi() after updating the
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
* trampoline.
*/
if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
@@ -198,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
@@ -215,9 +215,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
if (tr->func.ftrace_managed) {
if (lock_direct_mutex)
- ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
else
- ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
}
@@ -243,7 +243,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
if (tr->func.ftrace_managed) {
ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d517d13878cf..767e8930b0bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2967,6 +2967,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
}
} else if (opcode == BPF_EXIT) {
return -ENOTSUPP;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!(*reg_mask & (dreg | sreg)))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ *reg_mask |= (sreg | dreg);
+ /* else dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
if (!(*reg_mask & dreg))
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 935e8121b21e..4b249f81c693 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6856,14 +6856,12 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
- struct file *f;
-
- f = fget_raw(fd);
- if (!f)
+ struct fd f = fdget_raw(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f);
- fput(f);
+ cgrp = cgroup_v1v2_get_from_file(f.file);
+ fdput(f);
return cgrp;
}
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2f9c912df1c..144b2bd86b14 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,6 +7,5 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLOB_DEPRECATED is not set
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 0b6379adff6b..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -68,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
@@ -86,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
if (offset && offset + len <= offset)
return -EINVAL;
- if (selector && !access_ok(selector, sizeof(*selector)))
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracee's selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracee's selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
return -EFAULT;
break;
@@ -94,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..bfe73db1c26c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1625,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1637,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -1954,6 +1961,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret: where to return the file for the pidfd
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret: where to return the file for the pidfd
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2008,7 +2100,7 @@ static void rv_task_fork(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -2101,6 +2193,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2110,6 +2204,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2252,7 +2349,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2277,6 +2374,9 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2294,21 +2394,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2625,6 +2716,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2728,7 +2820,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
@@ -2736,6 +2829,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8ce75495e04f..d2742af0f0fd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -189,9 +189,12 @@ void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
+ for_each_action_of_desc(desc, action) {
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ if (action->secondary && action->secondary->thread)
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 54d077e1a2dc..5a60cc52adc0 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -337,11 +337,20 @@ static void delay_access(int type)
*/
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
switch (size) {
- case 1: return READ_ONCE(*(const u8 *)ptr);
- case 2: return READ_ONCE(*(const u16 *)ptr);
- case 4: return READ_ONCE(*(const u32 *)ptr);
- case 8: return READ_ONCE(*(const u64 *)ptr);
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
default: return 0; /* Ignore; we do not diff the values. */
}
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..4bc6e0971ec9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
/* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
@@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- char name[TASK_COMM_LEN];
- va_list aq;
- int len;
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- va_copy(aq, args);
- len = vsnprintf(name, sizeof(name), namefmt, aq);
- va_end(aq);
- if (len >= TASK_COMM_LEN) {
- struct kthread *kthread = to_kthread(task);
-
- /* leave it truncated when out of memory. */
- kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
- }
- set_task_comm(task, name);
- }
+free_create:
kfree(create);
return task;
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..feaf90c5e537 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -596,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -612,7 +612,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_object_groups,
@@ -628,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..dcd1d5bfc1e0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
printk(" lock(");
@@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
- printk(" lock(");
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
__print_lock_name(source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
@@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5693,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is a potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f04b1978899d..153ddc4c47ef 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -51,8 +51,11 @@ torture_param(int, rt_boost, 2,
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
@@ -79,10 +82,12 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(int tid);
@@ -252,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -365,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
static int torture_mutex_lock(int tid __maybe_unused)
__acquires(torture_mutex)
@@ -393,11 +473,24 @@ __releases(torture_mutex)
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
.task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -504,6 +597,28 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
+
+/*
+ * Initialize all MAX_NESTED_LOCKS entries of torture_nested_rtmutexes[],
+ * pairing each rt_mutex with its own entry of nested_rtmutex_keys[].
+ * Every call passes this function's name (__func__) as the second
+ * __rt_mutex_init() argument.
+ */
+static void torture_rtmutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
+}
+
+/*
+ * Lock, in ascending index order, every nested rt_mutex whose bit is set
+ * in lockset.  Only indices below nested_locks are examined, so higher
+ * bits of lockset are ignored.  Always returns 0.
+ */
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
static int torture_rtmutex_lock(int tid __maybe_unused)
__acquires(torture_rtmutex)
@@ -545,11 +660,24 @@ static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
__torture_rt_boost(trsp);
}
+/*
+ * Unlock every nested rt_mutex whose bit is set in lockset, walking the
+ * indices from nested_locks - 1 down to 0.
+ */
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
.task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -684,6 +812,8 @@ static int lock_torture_writer(void *arg)
struct lock_stress_stats *lwsp = arg;
int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
+ u32 lockset_mask;
+ bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -692,19 +822,40 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock(tid);
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
- lwsp->n_lock_fail++; /* rare, but... */
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
- lwsp->n_lock_acquired++;
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ }
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock(tid);
+ if (!skip_main_lock) {
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -845,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress,
+ nested_locks, stat_interval, verbose, shuffle_interval,
+ stutter, shutdown_secs, onoff_interval, onoff_holdoff);
}
static void lock_torture_cleanup(void)
@@ -919,6 +1070,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -1068,6 +1220,10 @@ static int __init lock_torture_init(void)
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 29dc253d03af..93cca6e69860 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
index 486d4ff92719..a89f01e1d6b7 100644
--- a/kernel/module/livepatch.c
+++ b/kernel/module/livepatch.c
@@ -11,7 +11,7 @@
#include "internal.h"
/*
- * Persist Elf information about a module. Copy the Elf header,
+ * Persist ELF information about a module. Copy the ELF header,
* section header table, section string table, and symtab section
* index from info to mod->klp_info.
*/
@@ -25,11 +25,11 @@ int copy_module_elf(struct module *mod, struct load_info *info)
if (!mod->klp_info)
return -ENOMEM;
- /* Elf header */
+ /* ELF header */
size = sizeof(mod->klp_info->hdr);
memcpy(&mod->klp_info->hdr, info->hdr, size);
- /* Elf section header table */
+ /* ELF section header table */
size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
if (!mod->klp_info->sechdrs) {
@@ -37,7 +37,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_info;
}
- /* Elf section name string table */
+ /* ELF section name string table */
size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
if (!mod->klp_info->secstrings) {
@@ -45,7 +45,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_sechdrs;
}
- /* Elf symbol section index */
+ /* ELF symbol section index */
symndx = info->index.sym;
mod->klp_info->symndx = symndx;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a487ff24129b..80d9c6d77a45 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,21 +545,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ struct fd f = fdget(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
+ if (proc_ns_file(f.file)) {
+ ns = get_proc_ns(file_inode(f.file));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ } else if (!IS_ERR(pidfd_pid(f.file))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -571,17 +570,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, f.file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
+ fdput(f);
return err;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..f93954a0384d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
-
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
+ int pidfd;
+ struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ /*
+ * pidfd_prepare() hands back a descriptor number plus a file through
+ * pidfd_file; a negative return is passed straight to the caller.
+ * NOTE(review): the pid and flags checks removed above are presumably
+ * performed inside pidfd_prepare() now -- confirm against its definition.
+ */
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ /* Success: install pidfd_file at the descriptor number just obtained. */
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd0c9f913940..9644f6e5bf15 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -730,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > PRINTKRB_RECORD_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -792,9 +792,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
};
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
@@ -859,8 +856,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
@@ -893,9 +888,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
@@ -944,9 +936,6 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0786450074c1..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
break;
#endif
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index ab62074174c3..9071182b1284 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -53,9 +53,6 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- def_bool y
-
config TINY_SRCU
bool
default y if TINY_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 115616ac3bfa..4a1b9622598b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -14,6 +14,43 @@
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating that a polled GP
+ * has already completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
@@ -341,11 +378,13 @@ extern void rcu_init_geometry(void);
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+/*
+ * SRCU flavor: walk the node array reached through ssp->srcu_sup.  The
+ * ssp argument is parenthesized so that any pointer-valued expression,
+ * not just a plain identifier, expands correctly.
+ */
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first((ssp)->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 91fb5905a008..e82ec9f9a5d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -631,8 +631,7 @@ static int compute_real(int n)
static int
rcu_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_scale_cleanup();
kernel_power_off();
@@ -716,7 +715,7 @@ kfree_scale_thread(void *arg)
// is tested.
if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
(kfree_rcu_test_both && torture_random(&tr) & 0x800))
- kfree_rcu(alloc_ptr);
+ kfree_rcu_mightsleep(alloc_ptr);
else
kfree_rcu(alloc_ptr, rh);
}
@@ -771,8 +770,8 @@ kfree_scale_cleanup(void)
static int
kfree_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8e6c023212cb..147551c23baf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -119,7 +119,9 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
@@ -179,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
@@ -2194,12 +2195,11 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
@@ -2217,15 +2217,13 @@ rcu_torture_stats_print(void)
if (atomic_read(&n_rcu_torture_mberror) ||
atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
@@ -2358,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
- "nocbs_nthreads=%d nocbs_toggle=%d\n",
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2369,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
- nocbs_nthreads, nocbs_toggle);
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3273,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void)
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
+/*
+ * Run n rounds of a cross-CPU task dump.  Each round, with preemption
+ * disabled, picks the CPU numbered one above the current CPU (wrapping to
+ * 0 once that reaches nr_cpu_ids), logs the pair, calls dump_cpu_task()
+ * on it, and then sleeps uninterruptibly for 15 * HZ jiffies.
+ * NOTE(review): presumably dump_cpu_task() is what exercises NMIs here --
+ * that is not visible from this function.
+ *
+ * The test body is compiled only when rcutorture is built in; otherwise a
+ * non-zero n merely triggers a one-time warning reporting that value.
+ */
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ /* Report the value actually passed in, i.e. the one the condition tests. */
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", n);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -3297,6 +3320,8 @@ rcu_torture_cleanup(void)
return;
}
+ rcutorture_test_nmis(test_nmis);
+
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
@@ -3463,6 +3488,188 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+// Ten mutexes, ten rwsems, and ten SRCU domains.  They are gathered into
+// the muts[], rwsems[], and srcus[] arrays of rcu_torture_init_srcu_lockdep(),
+// which indexes all three by the same cycle position.
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+// Compute the index that follows i in a cycle of length cyclelen, and log
+// the operation sequence the caller is about to perform.
+//
+// f is printed as the message prefix; fl, fs, and fu are the names printed
+// for the lock, synchronize, and unlock steps respectively.
+//
+// Returns i + 1 while that is below cyclelen.  Once i + 1 reaches cyclelen,
+// returns 0 if deadlock is nonzero (wrapping around to close the cycle) and
+// -1 otherwise, in which case the logged sequence omits the fs step.
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+//
+// The test_srcu_lockdep module parameter is decoded as decimal digits DNNL:
+// D (value / 1000) asks for a closed, deadlock-prone cycle when 1,
+// NN ((value / 10) % 100) selects the scenario, and L (value % 10) is the
+// cycle length.  Does nothing when the parameter is zero.  When the D or L
+// digit is out of range, or NN names no scenario handled below, usage help
+// is printed instead of running a test.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ // Split the parameter into its D, NN, and L digits.
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ // NN=0: SRCU only.  Step i holds an srcus[i] reader across
+ // synchronize_srcu(srcus[j]), where j is the next cycle index.
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ // NN=1: SRCU plus mutex.  Step i first takes muts[i] inside an srcus[i]
+ // reader, then holds muts[i] across synchronize_srcu(srcus[j]).
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ // NN=2: SRCU plus rwsem.  Step i first read-acquires rwsems[i] inside an
+ // srcus[i] reader, then write-holds rwsems[i] across
+ // synchronize_srcu(srcus[j]).
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ // NN=3: like NN=0, except that step 0 uses a Tasks Trace RCU reader in
+ // place of an SRCU reader and the final step waits with
+ // synchronize_rcu_tasks_trace() in place of synchronize_srcu().
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+ // Reached on a rejected digit or an NN value not handled above.
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
static int __init
rcu_torture_init(void)
{
@@ -3501,9 +3708,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -3540,7 +3755,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index afa3e1a2f690..1970ce5f22d4 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -1031,7 +1031,7 @@ ref_scale_cleanup(void)
static int
ref_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq, shutdown_start);
+ wait_event_idle(shutdown_wq, shutdown_start);
smp_mb(); // Wake before output.
ref_scale_cleanup();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b12fb0cec44d..336af24e0fe3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ab4ee58af84b..20d7a238d675 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -103,7 +103,7 @@ do { \
#define spin_trylock_irqsave_rcu_node(p, flags) \
({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
\
if (___locked) \
smp_mb__after_unlock_lock(); \
@@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
sdp->mynode = NULL;
sdp->cpu = cpu;
INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -173,14 +173,14 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
- ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
- if (!ssp->node)
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
return false;
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
* leaves of the srcu_node tree.
*/
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
sdp->mynode = &snp_first[cpu / levelspread[level]];
@@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
}
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
return true;
}
@@ -236,36 +236,47 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
- ssp->node = NULL;
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ /*
+ * Dynamically initialized structures get a freshly zeroed srcu_sup
+ * here; for the static case the pointer must already be non-NULL.
+ */
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
- ssp->sda_is_static = is_static;
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- if (!ssp->sda)
+ if (!ssp->sda) {
+ if (!is_static) {
+ kfree(ssp->srcu_sup);
+ /* Do not leave a dangling pointer behind on failure. */
+ ssp->srcu_sup = NULL;
+ }
return -ENOMEM;
+ }
init_srcu_struct_data(ssp);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
- if (!ssp->sda_is_static) {
+ /*
+ * Only the dynamic case unwinds and fails; a static structure
+ * falls through and keeps its current size state.
+ */
+ if (!ssp->srcu_sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(ssp->srcu_sup);
+ /* Mirror the ->sda handling: clear the freed pointer. */
+ ssp->srcu_sup = NULL;
return -ENOMEM;
}
} else {
- WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
}
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -277,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -294,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -306,8 +315,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
*/
static void __srcu_transition_to_big(struct srcu_struct *ssp)
{
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
}
/*
@@ -318,15 +327,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
unsigned long flags;
/* Double-checked locking on ->srcu_size-state. */
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -337,14 +346,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
- if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
return;
j = jiffies;
- if (ssp->srcu_size_jiffies != j) {
- ssp->srcu_size_jiffies = j;
- ssp->srcu_n_lock_retries = 0;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
}
- if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
return;
__srcu_transition_to_big(ssp);
}
@@ -361,9 +370,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
if (spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, *flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_rcu_node(sdp, *flags);
}
@@ -375,9 +384,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
*/
static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
- if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
}
@@ -394,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -607,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long gpstart;
unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
jbase = 0;
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
j = jiffies - 1;
- gpstart = READ_ONCE(ssp->srcu_gp_start);
+ gpstart = READ_ONCE(sup->srcu_gp_start);
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
}
}
@@ -634,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -648,21 +659,23 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
- WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
- rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- if (!ssp->sda_is_static) {
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
}
- kfree(ssp->node);
- ssp->node = NULL;
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -760,23 +773,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
struct srcu_data *sdp;
int state;
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = this_cpu_ptr(ssp->sda);
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
- WRITE_ONCE(ssp->srcu_gp_start, jiffies);
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -849,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp)
unsigned long sgsne;
struct srcu_node *snp;
int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
cbdelay = 0;
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
cbdelay);
@@ -879,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
if (last_lvl)
cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
@@ -911,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp)
}
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
}
/* Transition to big if needed. */
@@ -930,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (ss_state == SRCU_SIZE_ALLOC)
init_srcu_struct_nodes(ssp, GFP_KERNEL);
else
- smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
@@ -950,7 +964,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (snp)
for (; snp != NULL; snp = snp->srcu_parent) {
sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -963,9 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
spin_unlock_irqrestore_rcu_node(snp, flags);
}
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -990,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
struct srcu_node *snp;
struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Ensure that snp node tree is fully initialized before traversing it */
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
snp_leaf = NULL;
else
snp_leaf = sdp->mynode;
@@ -1000,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp_leaf)
/* Each pass through the loop does one level of the srcu_node tree. */
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
@@ -1027,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/* Top of tree, must ensure the grace period will be started. */
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
/* If grace period not already in progress, start it. */
- if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
// And how can that list_add() in the "else" clause
@@ -1049,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
// can only be executed during early boot when there is only
// the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
+ queue_delayed_work(rcu_gp_wq, &sup->work,
!!srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -1085,16 +1100,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1154,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -1199,7 +1234,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
* sequence number cannot wrap around in the meantime.
*/
idx = __srcu_read_lock_nmisafe(ssp);
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
@@ -1208,8 +1243,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
@@ -1307,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
@@ -1420,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
// Any prior manipulation of SRCU-protected data must happen
// before the load from ->srcu_gp_seq.
smp_mb();
- return rcu_seq_snap(&ssp->srcu_gp_seq);
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -1467,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1486,8 +1523,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
}
/*
@@ -1501,13 +1538,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
&sdp->srcu_barrier_head)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
}
spin_unlock_irq_rcu_node(sdp);
}
@@ -1520,23 +1557,23 @@ void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
int idx;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
idx = __srcu_read_lock_nmisafe(ssp);
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
else
for_each_possible_cpu(cpu)
@@ -1544,12 +1581,12 @@ void srcu_barrier(struct srcu_struct *ssp)
__srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1575,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1587,39 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- ssp->srcu_n_exp_nodelay = 0;
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1627,10 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
- ssp->srcu_n_exp_nodelay = 0;
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1656,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1684,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
@@ -1700,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1724,22 +1761,24 @@ static void process_srcu(struct work_struct *work)
unsigned long curdelay;
unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
curdelay = srcu_get_delay(ssp);
if (curdelay) {
- WRITE_ONCE(ssp->reschedule_count, 0);
+ WRITE_ONCE(sup->reschedule_count, 0);
} else {
j = jiffies;
- if (READ_ONCE(ssp->reschedule_jiffies) == j) {
- WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
- WRITE_ONCE(ssp->reschedule_count, 1);
- WRITE_ONCE(ssp->reschedule_jiffies, j);
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
}
}
srcu_reschedule(ssp, curdelay);
@@ -1752,7 +1791,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
@@ -1774,14 +1813,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
- int ss_state = READ_ONCE(ssp->srcu_size_state);
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
srcu_size_state_name[ss_state_idx]);
if (!ssp->sda) {
// Called after cleanup_srcu_struct(), perhaps.
@@ -1838,7 +1877,7 @@ early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
/* Decide on srcu_struct-size strategy. */
if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
@@ -1858,12 +1897,13 @@ void __init srcu_init(void)
*/
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- list_del_init(&ssp->work.work.entry);
- if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
- ssp->srcu_size_state = SRCU_SIZE_ALLOC;
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1873,13 +1913,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1888,10 +1929,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index bfb5e1549f2b..5f4fc8184dd0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -136,8 +136,16 @@ static struct rcu_tasks rt_name = \
.kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+#endif
+
+#ifdef CONFIG_TASKS_RCU
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
@@ -830,6 +838,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
* Exiting tasks may escape the tasklist scan. Those are vulnerable
* until their final schedule() with TASK_DEAD state. To cope with
@@ -848,6 +863,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -923,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
* call_rcu_tasks() - Queue an RCU for invocation task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7b95ee98a1a5..f52ff7241041 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/*
@@ -1955,7 +1956,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
bool needacc = false;
struct rcu_node *rnp;
@@ -1987,7 +1987,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
* NOCB kthreads have their own way to deal with that...
*/
if (!rcu_rdp_is_offloaded(rdp)) {
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
} else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
/*
* ...but NOCB kthreads may miss or delay callbacks acceleration
@@ -1999,8 +2004,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
if (needacc) {
rcu_nocb_lock_irqsave(rdp, flags);
@@ -2131,6 +2134,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
break;
}
} else {
+ // In rcuoc context, so no worries about depriving
+ // other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -4951,9 +4956,8 @@ void __init rcu_init(void)
else
qovld_calc = qovld;
- // Kick-start any polled grace periods that started early.
- if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
- (void)start_poll_synchronize_rcu_expedited();
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 249c2967d9e6..3b7abb58157d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_exp_jiffies_till_stall_check();
@@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void)
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
mask = READ_ONCE(rnp->expmask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- preempt_disable();
if (cpu_online(cpu))
tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
- preempt_enable();
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
@@ -802,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -1065,9 +1068,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
if (rcu_init_invoked())
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
if (!poll_state_synchronize_rcu(s)) {
- rnp->exp_seq_poll_rq = s;
- if (rcu_init_invoked())
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
}
if (rcu_init_invoked())
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 9e1c8caec5ce..f2280616f9d5 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1312,6 +1312,7 @@ int rcu_nocb_cpu_offload(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+#ifdef CONFIG_RCU_LAZY
static unsigned long
lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -1360,6 +1361,7 @@ static struct shrinker lazy_rcu_shrinker = {
.batch = 0,
.seeks = DEFAULT_SEEKS,
};
+#endif // #ifdef CONFIG_RCU_LAZY
void __init rcu_init_nohz(void)
{
@@ -1391,8 +1393,10 @@ void __init rcu_init_nohz(void)
if (!rcu_state.nocb_is_setup)
return;
+#ifdef CONFIG_RCU_LAZY
if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
pr_err("Failed to register lazy_rcu shrinker!\n");
+#endif // #ifdef CONFIG_RCU_LAZY
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df294..8f6330f0e9ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
/*
* Now find a thread we can wake up to take the signal off the queue.
*
- * If the main thread wants the signal, it gets first crack.
- * Probably the least surprising to the average bear.
+ * Try the suggested task first (may or may not be the main thread).
*/
if (wants_signal(sig, p))
t = p;
@@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
ret = -1;
rcu_read_lock();
+
+ /*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
t = pid_task(pid, type);
- if (!t || !likely(lock_task_sighand(t, &flags)))
+ if (!t)
+ goto ret;
+ if (type != PIDTYPE_PID && same_thread_group(t, current))
+ t = current;
+ if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..1b725510dd0f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -793,10 +793,15 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (tasklet_clear_sched(t)) {
- if (t->use_callback)
+ if (t->use_callback) {
+ trace_tasklet_entry(t, t->callback);
t->callback(t);
- else
+ trace_tasklet_exit(t, t->callback);
+ } else {
+ trace_tasklet_entry(t, t->func);
t->func(t->data);
+ trace_tasklet_exit(t, t->func);
+ }
}
tasklet_unlock(t);
continue;
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..351de7916302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
+ bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+ (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+ uid_eq(keuid, old->fsuid))) &&
+ (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+ return 0;
+
+ ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+ euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+ suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+ if ((ruid_new || euid_new || suid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
- if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
- !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
- goto error;
- if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
- !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
- goto error;
- if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
- !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
- goto error;
- }
-
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
@@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
+ bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
@@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+ (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+ gid_eq(kegid, old->fsgid))) &&
+ (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+ return 0;
+
+ rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+ egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+ sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+ if ((rgid_new || egid_new || sgid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETGID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
- if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
- !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
- goto error;
- if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
- !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
- goto error;
- if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
- !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
- goto error;
- }
if (rgid != (gid_t) -1)
new->gid = krgid;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..65b8658da829 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ ktime_t next_p;
+ u32 rem;
+
tick_do_timer_cpu = cpu;
- tick_next_period = ktime_get();
+ next_p = ktime_get();
+ div_u64_rem(next_p, TICK_NSEC, &rem);
+ if (rem) {
+ next_p -= rem;
+ next_p += TICK_NSEC;
+ }
+
+ tick_next_period = next_p;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..52254679ec48 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU_EXP) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+ return true;
+ }
+
return false;
}
@@ -527,7 +532,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
tick_nohz_full_running = true;
}
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
* The tick_do_timer_cpu CPU handles housekeeping duty (unbound
@@ -535,8 +540,13 @@ static int tick_nohz_cpu_down(unsigned int cpu)
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -637,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!ts->idle_active))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (ts->idle_active && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -681,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -691,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -722,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -732,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
-
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -1084,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1119,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,82 @@ enum tick_nohz_mode {
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
* it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a856d4a34c67..5b1e7fa41ca8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -257,7 +257,7 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config DYNAMIC_FTRACE_WITH_CALL_OPS
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c67bcc89a771..3b46dba3f69b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2583,28 +2583,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr;
+ unsigned long addr = READ_ONCE(ops->direct_call);
- addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
arch_ftrace_set_direct_caller(fregs, addr);
}
-
-static struct ftrace_ops direct_ops = {
- .func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
- | FTRACE_OPS_FL_PERMANENT,
- /*
- * By declaring the main trampoline as this trampoline
- * it will never have one allocated for it. Allocated
- * trampolines should not call direct functions.
- * The direct_ops should only be called by the builtin
- * ftrace_regs_caller trampoline.
- */
- .trampoline = FTRACE_REGS_ADDR,
-};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -5301,391 +5286,9 @@ struct ftrace_direct_func {
static LIST_HEAD(ftrace_direct_funcs);
-/**
- * ftrace_find_direct_func - test an address if it is a registered direct caller
- * @addr: The address of a registered direct caller
- *
- * This searches to see if a ftrace direct caller has been registered
- * at a specific address, and if so, it returns a descriptor for it.
- *
- * This can be used by architecture code to see if an address is
- * a direct caller (trampoline) attached to a fentry/mcount location.
- * This is useful for the function_graph tracer, as it may need to
- * do adjustments if it traced a location that also has a direct
- * trampoline attached to it.
- */
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *entry;
- bool found = false;
-
- /* May be called by fgraph trampoline (protected by rcu tasks) */
- list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
- if (entry->addr == addr) {
- found = true;
- break;
- }
- }
- if (found)
- return entry;
-
- return NULL;
-}
-
-static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *direct;
-
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
- if (!direct)
- return NULL;
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
- return direct;
-}
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
-/**
- * register_ftrace_direct - Call a custom trampoline directly
- * @ip: The address of the nop at the beginning of a function
- * @addr: The address of the trampoline to call at @ip
- *
- * This is used to connect a direct call from the nop location (@ip)
- * at the start of ftrace traced functions. The location that it calls
- * (@addr) must be able to handle a direct call, and save the parameters
- * of the function being traced, and restore them (or inject new ones
- * if needed), before returning.
- *
- * Returns:
- * 0 on success
- * -EBUSY - Another direct function is already attached (there can be only one)
- * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- * -ENOMEM - There was an allocation failure.
- */
-int register_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *free_hash = NULL;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- /* See if there's a direct function at @ip already */
- ret = -EBUSY;
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
-
- ret = -ENODEV;
- rec = lookup_rec(ip, ip);
- if (!rec)
- goto out_unlock;
-
- /*
- * Check if the rec says it has a direct call but we didn't
- * find one earlier?
- */
- if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
- goto out_unlock;
-
- /* Make sure the ip points to the exact record */
- if (ip != rec->ip) {
- ip = rec->ip;
- /* Need to check this ip for a direct. */
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
- }
-
- ret = -ENOMEM;
- direct = ftrace_find_direct_func(addr);
- if (!direct) {
- direct = ftrace_alloc_direct_func(addr);
- if (!direct)
- goto out_unlock;
- }
-
- entry = ftrace_add_rec_direct(ip, addr, &free_hash);
- if (!entry)
- goto out_unlock;
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-
- if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function_nolock(&direct_ops);
- if (ret)
- ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
- }
-
- if (ret) {
- remove_hash_entry(direct_functions, entry);
- kfree(entry);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- if (free_hash)
- free_ftrace_hash(free_hash);
- free_hash = NULL;
- ftrace_direct_func_count--;
- }
- } else {
- direct->count++;
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- if (free_hash) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_direct);
-
-static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
- struct dyn_ftrace **recp)
-{
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
-
- rec = lookup_rec(*ip, *ip);
- if (!rec)
- return NULL;
-
- entry = __ftrace_lookup_ip(direct_functions, rec->ip);
- if (!entry) {
- WARN_ON(rec->flags & FTRACE_FL_DIRECT);
- return NULL;
- }
-
- WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
-
- /* Passed in ip just needs to be on the call site */
- *ip = rec->ip;
-
- if (recp)
- *recp = rec;
-
- return entry;
-}
-
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *hash;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, NULL);
- if (!entry)
- goto out_unlock;
-
- hash = direct_ops.func_hash->filter_hash;
- if (hash->count == 1)
- unregister_ftrace_function(&direct_ops);
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
-
- WARN_ON(ret);
-
- remove_hash_entry(direct_functions, entry);
-
- direct = ftrace_find_direct_func(addr);
- if (!WARN_ON(!direct)) {
- /* This is the good path (see the ! before WARN) */
- direct->count--;
- WARN_ON(direct->count < 0);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- kfree(entry);
- ftrace_direct_func_count--;
- }
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-
-static struct ftrace_ops stub_ops = {
- .func = ftrace_stub,
-};
-
-/**
- * ftrace_modify_direct_caller - modify ftrace nop directly
- * @entry: The ftrace hash entry of the direct helper for @rec
- * @rec: The record representing the function site to patch
- * @old_addr: The location that the site at @rec->ip currently calls
- * @new_addr: The location that the site at @rec->ip should call
- *
- * An architecture may overwrite this function to optimize the
- * changing of the direct callback on an ftrace nop location.
- * This is called with the ftrace_lock mutex held, and no other
- * ftrace callbacks are on the associated record (@rec). Thus,
- * it is safe to modify the ftrace record, where it should be
- * currently calling @old_addr directly, to call @new_addr.
- *
- * This is called with direct_mutex locked.
- *
- * Safety checks should be made to make sure that the code at
- * @rec->ip is currently calling @old_addr. And this must
- * also update entry->direct to @new_addr.
- */
-int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
- struct dyn_ftrace *rec,
- unsigned long old_addr,
- unsigned long new_addr)
-{
- unsigned long ip = rec->ip;
- int ret;
-
- lockdep_assert_held(&direct_mutex);
-
- /*
- * The ftrace_lock was used to determine if the record
- * had more than one registered user to it. If it did,
- * we needed to prevent that from changing to do the quick
- * switch. But if it did not (only a direct caller was attached)
- * then this function is called. But this function can deal
- * with attached callers to the rec that we care about, and
- * since this function uses standard ftrace calls that take
- * the ftrace_lock mutex, we need to release it.
- */
- mutex_unlock(&ftrace_lock);
-
- /*
- * By setting a stub function at the same address, we force
- * the code to call the iterator and the direct_ops helper.
- * This means that @ip does not call the direct call, and
- * we can simply modify it.
- */
- ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
- if (ret)
- goto out_lock;
-
- ret = register_ftrace_function_nolock(&stub_ops);
- if (ret) {
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
- goto out_lock;
- }
-
- entry->direct = new_addr;
-
- /*
- * By removing the stub, we put back the direct call, calling
- * the @new_addr.
- */
- unregister_ftrace_function(&stub_ops);
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-
- out_lock:
- mutex_lock(&ftrace_lock);
-
- return ret;
-}
-
-/**
- * modify_ftrace_direct - Modify an existing direct call to call something else
- * @ip: The instruction pointer to modify
- * @old_addr: The address that the current @ip calls directly
- * @new_addr: The address that the @ip should call
- *
- * This modifies a ftrace direct caller at an instruction pointer without
- * having to disable it first. The direct call will switch over to the
- * @new_addr without missing anything.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -ENODEV : the @ip given has no direct caller attached
- * -EINVAL : the @old_addr does not match the current direct caller
- */
-int modify_ftrace_direct(unsigned long ip,
- unsigned long old_addr, unsigned long new_addr)
-{
- struct ftrace_direct_func *direct, *new_direct = NULL;
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- mutex_lock(&ftrace_lock);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, &rec);
- if (!entry)
- goto out_unlock;
-
- ret = -EINVAL;
- if (entry->direct != old_addr)
- goto out_unlock;
-
- direct = ftrace_find_direct_func(old_addr);
- if (WARN_ON(!direct))
- goto out_unlock;
- if (direct->count > 1) {
- ret = -ENOMEM;
- new_direct = ftrace_alloc_direct_func(new_addr);
- if (!new_direct)
- goto out_unlock;
- direct->count--;
- new_direct->count++;
- } else {
- direct->addr = new_addr;
- }
-
- /*
- * If there's no other ftrace callback on the rec->ip location,
- * then it can be changed directly by the architecture.
- * If there is another caller, then we just need to change the
- * direct caller helper to point to @new_addr.
- */
- if (ftrace_rec_count(rec) == 1) {
- ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
- } else {
- entry->direct = new_addr;
- ret = 0;
- }
-
- if (ret) {
- direct->addr = old_addr;
- if (unlikely(new_direct)) {
- direct->count++;
- list_del_rcu(&new_direct->next);
- synchronize_rcu_tasks();
- kfree(new_direct);
- ftrace_direct_func_count--;
- }
- }
-
- out_unlock:
- mutex_unlock(&ftrace_lock);
- mutex_unlock(&direct_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-
-#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5714,7 +5317,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
/**
- * register_ftrace_direct_multi - Call a custom trampoline directly
+ * register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the trampoline to call at @ops functions
@@ -5735,7 +5338,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
* -ENODEV - @ip does not point to a ftrace nop location (or not supported)
* -ENOMEM - There was an allocation failure.
*/
-int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
@@ -5777,6 +5380,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
+ ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
@@ -5793,11 +5397,11 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
return err;
}
-EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
/**
- * unregister_ftrace_direct_multi - Remove calls to custom trampoline
- * previously registered by register_ftrace_direct_multi for @ops object.
+ * unregister_ftrace_direct - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
*
* This is used to remove a direct calls to @addr from the nop locations
@@ -5808,7 +5412,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
* 0 on success
* -EINVAL - The @ops object was not properly registered.
*/
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+ bool free_filters)
{
struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
@@ -5826,12 +5431,15 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* cleanup for possible another register call */
ops->func = NULL;
ops->trampoline = 0;
+
+ if (free_filters)
+ ftrace_free_filter(ops);
return err;
}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
-__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash;
struct ftrace_func_entry *entry, *iter;
@@ -5847,6 +5455,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
+ tmp_ops.direct_call = addr;
err = register_ftrace_function_nolock(&tmp_ops);
if (err)
@@ -5868,6 +5477,8 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
entry->direct = addr;
}
}
+ /* Prevent store tearing if a trampoline concurrently accesses the value */
+ WRITE_ONCE(ops->direct_call, addr);
mutex_unlock(&ftrace_lock);
@@ -5878,7 +5489,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
/**
- * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5895,19 +5506,19 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
{
if (check_direct_multi(ops))
return -EINVAL;
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EINVAL;
- return __modify_ftrace_direct_multi(ops, addr);
+ return __modify_ftrace_direct(ops, addr);
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * modify_ftrace_direct - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5921,7 +5532,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
int err;
@@ -5931,11 +5542,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
return -EINVAL;
mutex_lock(&direct_mutex);
- err = __modify_ftrace_direct_multi(ops, addr);
+ err = __modify_ftrace_direct(ops, addr);
mutex_unlock(&direct_mutex);
return err;
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 4496975f2029..efbbec2caff8 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr)
if (!found)
return;
- kvfree_rcu(inst);
+ kvfree_rcu_mightsleep(inst);
}
/*
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 20d0c4a97633..2d2616678295 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp,
return -ENOENT;
list_del_rcu(&link->list);
- kvfree_rcu(link);
+ kvfree_rcu_mightsleep(link);
if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ff0536cea968..a931d9aaea26 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -785,14 +785,7 @@ static struct fgraph_ops fgraph_ops __initdata = {
};
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-#ifndef CALL_DEPTH_ACCOUNT
-#define CALL_DEPTH_ACCOUNT ""
-#endif
-
-noinline __noclone static void trace_direct_tramp(void)
-{
- asm(CALL_DEPTH_ACCOUNT);
-}
+static struct ftrace_ops direct;
#endif
/*
@@ -870,8 +863,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Register direct function together with graph tracer
* and make sure we get graph trace.
*/
- ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
+ ret = register_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp);
if (ret)
goto out;
@@ -891,8 +885,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
unregister_ftrace_graph(&fgraph_ops);
- ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ret = unregister_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp,
+ true);
if (ret)
goto out;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..b7cbd66f889e
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+ VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+ struct vhost_task *vtsk = data;
+ int ret;
+
+ ret = vtsk->fn(vtsk->data);
+ complete(&vtsk->exited);
+ do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop() and return from their worker
+ * function when it returns true.
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+ pid_t pid = vtsk->task->pid;
+
+ set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+ wake_up_process(vtsk->task);
+ /*
+ * Make sure vhost_task_fn is no longer accessing the vhost_task before
+ * freeing it below. If userspace crashed or exited without closing,
+ * then the vhost_task->task could already be marked dead so
+ * kernel_wait4() will return early.
+ */
+ wait_for_completion(&vtsk->exited);
+ /*
+ * If we are just closing/removing a device and the parent process is
+ * not exiting then reap the task.
+ */
+ kernel_wait4(pid, NULL, __WCLONE, NULL);
+ kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to check for a pending stop request
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+ return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: function to run in the new task
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+ const char *name)
+{
+ struct kernel_clone_args args = {
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .exit_signal = 0,
+ .fn = vhost_task_fn,
+ .name = name,
+ .user_worker = 1,
+ .no_files = 1,
+ .ignore_signals = 1,
+ };
+ struct vhost_task *vtsk;
+ struct task_struct *tsk;
+
+ vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+ if (!vtsk)
+ return NULL;
+ init_completion(&vtsk->exited);
+ vtsk->data = arg;
+ vtsk->fn = fn;
+
+ args.fn_arg = vtsk;
+
+ tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+ if (IS_ERR(tsk)) {
+ kfree(vtsk);
+ return NULL;
+ }
+
+ vtsk->task = tsk;
+ return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+ wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);