diff options
Diffstat (limited to 'kernel/fork.c')
| -rw-r--r-- | kernel/fork.c | 208 |
1 files changed, 163 insertions, 45 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -12,6 +12,16 @@ */ #include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/user.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/stat.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> +#include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> @@ -55,6 +65,7 @@ #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> +#include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> @@ -76,6 +87,7 @@ #include <linux/compiler.h> #include <linux/sysctl.h> #include <linux/kcov.h> +#include <linux/livepatch.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -167,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack) */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); + +static int free_vm_stack_cache(unsigned int cpu) +{ + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *vm_stack = cached_vm_stacks[i]; + + if (!vm_stack) + continue; + + vfree(vm_stack->addr); + cached_vm_stacks[i] = NULL; + } + + return 0; +} #endif static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) @@ -175,23 +205,21 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, + THREADINFO_GFP, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -215,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; @@ -296,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -308,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -432,11 +456,13 @@ void __init fork_init(void) int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#define ARCH_MIN_TASKALIGN 0 #endif + int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - arch_task_struct_size, ARCH_MIN_TASKALIGN, + arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif @@ -453,6 +479,11 @@ void __init fork_init(void) for (i = 0; i < UCOUNT_COUNTS; i++) { init_user_ns.ucount_max[i] = max_threads/2; } + +#ifdef CONFIG_VMAP_STACK + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", + NULL, free_vm_stack_cache); +#endif } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -523,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_canary(); #endif /* @@ -542,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: @@ -559,6 +594,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -615,12 +651,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -676,6 +713,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -994,7 +1032,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) if (task->flags & PF_KTHREAD) mm = NULL; else - atomic_inc(&mm->mm_users); + mmget(mm); } task_unlock(task); return mm; @@ -1182,7 +1220,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) vmacache_flush(tsk); if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); + mmget(oldmm); mm = oldmm; goto good_mm; } @@ -1297,13 +1335,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1313,7 +1352,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); + sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; sig->cputimer.running = true; } @@ -1322,6 +1361,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } +#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { @@ -1346,11 +1388,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS + INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -1367,9 +1409,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - sig->has_child_subreaper = current->signal->has_child_subreaper || - current->signal->is_child_subreaper; - mutex_init(&sig->cred_guard_mutex); return 0; @@ -1421,10 +1460,12 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a single task. */ @@ -1437,6 +1478,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init(struct task_struct *tsk) { } +#endif static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) @@ -1444,6 +1488,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) task->pids[type].pid = pid; } +static inline void rcu_copy_process(struct task_struct *p) +{ +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_read_unlock_special.s = 0; + p->rcu_blocked_node = NULL; + INIT_LIST_HEAD(&p->rcu_node_entry); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + p->rcu_tasks_holdout = false; + INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); + p->rcu_tasks_idle_cpu = -1; +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1516,6 +1575,18 @@ static __latent_entropy struct task_struct *copy_process( if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. before we jump + * to any of the bad_fork_* labels. This is to avoid freeing + * p->set_child_tid which is (ab)used as a kthread's data pointer for + * kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1564,9 +1635,9 @@ static __latent_entropy struct task_struct *copy_process( prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_init(&p->vtime_seqcount); - p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_INACTIVE; + seqcount_init(&p->vtime.seqcount); + p->vtime.starttime = 0; + p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) @@ -1643,9 +1714,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); - retval = copy_semundo(clone_flags, p); + retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; + retval = copy_semundo(clone_flags, p); + if (retval) + goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; @@ -1679,11 +1753,6 @@ static __latent_entropy struct task_struct *copy_process( } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1735,7 +1804,7 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - threadgroup_change_begin(current); + cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1761,6 +1830,8 @@ static __latent_entropy struct task_struct *copy_process( p->parent_exec_id = current->self_exec_id; } + klp_copy_process(p); + spin_lock(¤t->sighand->siglock); /* @@ -1779,11 +1850,13 @@ static __latent_entropy struct task_struct *copy_process( */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1800,6 +1873,13 @@ static __latent_entropy struct task_struct *copy_process( p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); + /* + * Inherit has_child_subreaper flag under the same + * tasklist_lock with adding child to the process tree + * for propagate_has_child_subreaper optimization. + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); @@ -1825,7 +1905,7 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1834,9 +1914,11 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); bad_fork_free_pid: - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1860,6 +1942,8 @@ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); +bad_fork_cleanup_security: + security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: @@ -2053,6 +2137,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif +void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) +{ + struct task_struct *leader, *parent, *child; + int res; + + read_lock(&tasklist_lock); + leader = top = top->group_leader; +down: + for_each_thread(leader, parent) { + list_for_each_entry(child, &parent->children, sibling) { + res = visitor(child, data); + if (res) { + if (res < 0) + goto out; + leader = child; + goto down; + } +up: + ; + } + } + + if (leader != top) { + child = leader; + parent = child->real_parent; + leader = parent->group_leader; + goto up; + } +out: + read_unlock(&tasklist_lock); +} + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -2069,7 +2185,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, @@ -2277,6 +2393,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } + perf_event_namespaces(current); + bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); |