diff options
Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- | kernel/fork.c | 309 |
1 files changed, 274 insertions, 35 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index f68954d05e89..ed4e01daccaa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,8 @@ #include <linux/io_uring.h> #include <linux/bpf.h> #include <linux/stackprotector.h> +#include <linux/user_events.h> +#include <linux/iommu.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -451,13 +453,49 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +#ifdef CONFIG_PER_VMA_LOCK + +/* SLAB cache for vm_area_struct.lock */ +static struct kmem_cache *vma_lock_cachep; + +static bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vma_lock_cachep, vma->vm_lock); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } +static inline void vma_lock_free(struct vm_area_struct *vma) {} + +#endif /* CONFIG_PER_VMA_LOCK */ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (vma) - vma_init(vma, mm); + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + kmem_cache_free(vm_area_cachep, vma); + return NULL; + } + return vma; } @@ -465,26 +503,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (new) { - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. 
- */ - data_race(memcpy(new, orig, sizeof(*new))); - INIT_LIST_HEAD(&new->anon_vma_chain); - dup_anon_vma_name(orig, new); + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + data_race(memcpy(new, orig, sizeof(*new))); + if (!vma_lock_alloc(new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; } + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + return new; } -void vm_area_free(struct vm_area_struct *vma) +void __vm_area_free(struct vm_area_struct *vma) { + vma_numab_state_free(vma); free_anon_vma_name(vma); + vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } +#ifdef CONFIG_PER_VMA_LOCK +static void vm_area_free_rcu_cb(struct rcu_head *head) +{ + struct vm_area_struct *vma = container_of(head, struct vm_area_struct, + vm_rcu); + + /* The vma should not be locked while being destroyed. 
*/ + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); + __vm_area_free(vma); +} +#endif + +void vm_area_free(struct vm_area_struct *vma) +{ +#ifdef CONFIG_PER_VMA_LOCK + call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); +#else + __vm_area_free(vma); +#endif +} + static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -617,6 +685,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto out; + mt_clear_in_rcu(vmi.mas.tree); for_each_vma(old_vmi, mpnt) { struct file *file; @@ -700,6 +769,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); + if (!retval) + mt_set_in_rcu(vmi.mas.tree); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -755,11 +826,6 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (likely(!x)) - continue; - - /* Making sure this is not due to race with CPU offlining. */ - x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); @@ -777,6 +843,67 @@ static void check_mm(struct mm_struct *mm) #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static void do_check_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + WARN_ON_ONCE(current->active_mm == mm); +} + +static void do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } +} + +static void cleanup_lazy_tlbs(struct mm_struct *mm) +{ + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * In this case, lazy tlb mms are refcounted and would not reach + * __mmdrop until all CPUs have switched away and mmdrop()ed. 
+ */ + return; + } + + /* + * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it + * requires lazy mm users to switch to another mm when the refcount + * drops to zero, before the mm is freed. This requires IPIs here to + * switch kernel threads to init_mm. + * + * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm + * switch with the final userspace teardown TLB flush which leaves the + * mm lazy on this CPU but no others, reducing the need for additional + * IPIs here. There are cases where a final IPI is still required here, + * such as the final mmdrop being performed on a different CPU than the + * one exiting, or kernel threads using the mm when userspace exits. + * + * IPI overheads have not been found to be expensive, but they could be + * reduced in a number of possible ways, for example (in roughly + * increasing order of complexity): + * - The last lazy reference created by exit_mm() could instead switch + * to init_mm, however it's probable this will run on the same CPU + * immediately afterwards, so this may not reduce IPIs much. + * - A batch of mms requiring IPIs could be gathered and freed at once. + * - CPUs store active_mm where it can be remotely checked without a + * lock, to filter out false-positives in the cpumask. + * - After mm_users or mm_count reaches zero, switching away from the + * mm could clear mm_cpumask to reduce some IPIs, perhaps together + * with some batching or delaying of the final IPIs. + * - A delayed freeing and RCU-like quiescing sequence based on mm + * switching to avoid IPIs completely. 
+ */ + on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) + on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); +} + /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by @@ -788,6 +915,10 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); + + /* Ensure no CPUs are using this as their lazy tlb mm */ + cleanup_lazy_tlbs(mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); @@ -795,6 +926,7 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + mm_destroy_cid(mm); for (i = 0; i < NR_MM_COUNTERS; i++) percpu_counter_destroy(&mm->rss_stat[i]); @@ -1059,7 +1191,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; + tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; + tsk->migrate_from_cpu = -1; #endif return tsk; @@ -1130,6 +1264,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); +#ifdef CONFIG_PER_VMA_LOCK + mm->mm_lock_seq = 0; +#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -1164,18 +1301,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + if (mm_alloc_cid(mm)) + goto fail_cid; + for (i = 0; i < NR_MM_COUNTERS; i++) if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); - mm_init_cid(mm); return mm; fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + mm_destroy_cid(mm); +fail_cid: + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: @@ -1627,7 +1769,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } 
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk, + int no_files) { struct files_struct *oldf, *newf; int error = 0; @@ -1639,6 +1782,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) if (!oldf) goto out; + if (no_files) { + tsk->files = NULL; + goto out; + } + if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; @@ -1956,6 +2104,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @ret: the pidfd file to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. 
+ */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @ret: the pidfd file to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. 
+ */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -2010,7 +2243,7 @@ static void rv_task_fork(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static __latent_entropy struct task_struct *copy_process( +__latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, @@ -2103,6 +2336,8 @@ static __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; + if (args->user_worker) + p->flags |= PF_USER_WORKER; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2112,6 +2347,9 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->name) + strscpy_pad(p->comm, args->name, sizeof(p->comm)); + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2254,7 +2492,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; - retval = copy_files(clone_flags, p); + retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); @@ -2279,6 +2517,9 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + if (args->ignore_signals) + ignore_signals(p); + stackleak_task_init(p); if (pid != &init_struct_pid) { @@ -2296,21 +2537,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). 
*/ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; @@ -2505,6 +2737,7 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); @@ -2627,6 +2860,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .fn = fn, .fn_arg = arg, .io_thread = 1, + .user_worker = 1, }; return copy_process(NULL, 0, node, &args); @@ -2730,7 +2964,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) /* * Create a kernel thread. 
*/ -pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | @@ -2738,6 +2973,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, + .name = name, .kthread = 1, }; @@ -2936,7 +3172,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) * - make the CLONE_DETACHED bit reusable for clone3 * - make the CSIGNAL bits reusable for clone3 */ - if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) + if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) return false; if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == @@ -3067,6 +3303,9 @@ void __init proc_caches_init(void) NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); +#ifdef CONFIG_PER_VMA_LOCK + vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); +#endif mmap_init(); nsproxy_cache_init(); } |