aboutsummaryrefslogtreecommitdiff
path: root/kernel/fork.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c309
1 files changed, 274 insertions, 35 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index f68954d05e89..ed4e01daccaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,8 @@
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
+#include <linux/user_events.h>
+#include <linux/iommu.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -451,13 +453,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -465,26 +503,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- INIT_LIST_HEAD(&new->anon_vma_chain);
- dup_anon_vma_name(orig, new);
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
+
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -617,6 +685,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -700,6 +769,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -755,11 +826,6 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (likely(!x))
- continue;
-
- /* Making sure this is not due to race with CPU offlining. */
- x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
@@ -777,6 +843,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -788,6 +915,10 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
@@ -795,6 +926,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1059,7 +1191,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
#endif
return tsk;
@@ -1130,6 +1264,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1164,18 +1301,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
- mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1627,7 +1769,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1639,6 +1782,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -1956,6 +2104,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2010,7 +2243,7 @@ static void rv_task_fork(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -2103,6 +2336,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2112,6 +2347,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2254,7 +2492,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2279,6 +2517,9 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2296,21 +2537,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2505,6 +2737,7 @@ static __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
@@ -2627,6 +2860,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2730,7 +2964,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
@@ -2738,6 +2973,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
@@ -2936,7 +3172,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
* - make the CLONE_DETACHED bit reusable for clone3
* - make the CSIGNAL bits reusable for clone3
*/
- if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
@@ -3067,6 +3303,9 @@ void __init proc_caches_init(void)
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}