Diffstat (limited to 'kernel/fork.c')
-rw-r--r--	kernel/fork.c	195
1 file changed, 172 insertions, 23 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e2..623259fc794d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-						  int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush.  Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+	void *stack;
+	int i;
+
+	local_irq_disable();
+	for (i = 0; i < NR_CACHED_STACKS; i++) {
+		struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+		if (!s)
+			continue;
+		this_cpu_write(cached_stacks[i], NULL);
+
+		tsk->stack_vm_area = s;
+		local_irq_enable();
+		return s->addr;
+	}
+	local_irq_enable();
+
+	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+				     VMALLOC_START, VMALLOC_END,
+				     THREADINFO_GFP | __GFP_HIGHMEM,
+				     PAGE_KERNEL,
+				     0, node, __builtin_return_address(0));
+
+	/*
+	 * We can't call find_vm_area() in interrupt context, and
+	 * free_thread_stack() can be called in interrupt context,
+	 * so cache the vm_struct.
+	 */
+	if (stack)
+		tsk->stack_vm_area = find_vm_area(stack);
+	return stack;
+#else
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);

 	return page ? page_address(page) : NULL;
+#endif
 }

-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+	if (task_stack_vm_area(tsk)) {
+		unsigned long flags;
+		int i;
+
+		local_irq_save(flags);
+		for (i = 0; i < NR_CACHED_STACKS; i++) {
+			if (this_cpu_read(cached_stacks[i]))
+				continue;
+
+			this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+			local_irq_restore(flags);
+			return;
+		}
+		local_irq_restore(flags);
+
+		vfree(tsk->stack);
+		return;
+	}
+#endif
+
+	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }

-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-	kmem_cache_free(thread_stack_cache, stack);
+	kmem_cache_free(thread_stack_cache, tsk->stack);
 }

 void thread_stack_cache_init(void)
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;

-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-	/* All stack pages are in the same zone and belong to the same memcg. */
-	struct page *first_page = virt_to_page(stack);
+	void *stack = task_stack_page(tsk);
+	struct vm_struct *vm = task_stack_vm_area(tsk);

-	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-			    THREAD_SIZE / 1024 * account);
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+	if (vm) {
+		int i;
+
+		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+			mod_zone_page_state(page_zone(vm->pages[i]),
+					    NR_KERNEL_STACK_KB,
+					    PAGE_SIZE / 1024 * account);
+		}
+
+		/* All stack pages belong to the same memcg. */
+		memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+					    account * (THREAD_SIZE / 1024));
+	} else {
+		/*
+		 * All stack pages are in the same zone and belong to the
+		 * same memcg.
+		 */
+		struct page *first_page = virt_to_page(stack);

-	memcg_kmem_update_page_stat(
-		first_page, MEMCG_KERNEL_STACK_KB,
-		account * (THREAD_SIZE / 1024));
+		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+				    THREAD_SIZE / 1024 * account);
+
+		memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+					    account * (THREAD_SIZE / 1024));
+	}
 }

-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-	account_kernel_stack(tsk->stack, -1);
+	account_kernel_stack(tsk, -1);
 	arch_release_thread_stack(tsk->stack);
-	free_thread_stack(tsk->stack);
+	free_thread_stack(tsk);
+	tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+	tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+	if (atomic_dec_and_test(&tsk->stack_refcount))
+		release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * The task is finally done with both the stack and thread_info,
+	 * so free both.
+	 */
+	release_task_stack(tsk);
+#else
+	/*
+	 * If the task had a separate stack allocation, it should be gone
+	 * by now.
+	 */
+	WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
 	sched_autogroup_exit(sig);
+	/*
+	 * __mmdrop is not safe to call from softirq context on x86 due to
+	 * pgd_dtor so postpone it to the async context
+	 */
+	if (sig->oom_mm)
+		mmdrop_async(sig->oom_mm);
 	kmem_cache_free(signal_cachep, sig);
 }

@@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly;

 void __init fork_init(void)
 {
+	int i;
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -321,6 +444,10 @@ void __init fork_init(void)
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < UCOUNT_COUNTS; i++) {
+		init_user_ns.ucount_max[i] = max_threads/2;
+	}
 }

 int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	unsigned long *stack;
+	struct vm_struct *stack_vm_area;
 	int err;

 	if (node == NUMA_NO_NODE)
@@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!stack)
 		goto free_tsk;

+	stack_vm_area = task_stack_vm_area(tsk);
+
 	err = arch_dup_task_struct(tsk, orig);
+
+	/*
+	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
+	 * sure they're properly initialized before using any stack-related
+	 * functions again.
+	 */
+	tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+	tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	atomic_set(&tsk->stack_refcount, 1);
+#endif
+
 	if (err)
 		goto free_stack;

-	tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -390,21 +533,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;

-	account_kernel_stack(stack, 1);
+	account_kernel_stack(tsk, 1);

 	kcov_task_init(tsk);

 	return tsk;

 free_stack:
-	free_thread_stack(stack);
+	free_thread_stack(tsk);
 free_tsk:
 	free_task_struct(tsk);
 	return NULL;
 }

 #ifdef CONFIG_MMU
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+					struct mm_struct *oldmm)
 {
 	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
 	struct rb_node **rb_link, *rb_parent;
@@ -711,6 +855,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
@@ -719,6 +864,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmdrop(mm);
 }

@@ -1296,7 +1442,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
  * parts of the process environment (as per the clone
  * flags). The actual kick-off is left to the caller.
  */
-static struct task_struct *copy_process(unsigned long clone_flags,
+static __latent_entropy struct task_struct *copy_process(
+					unsigned long clone_flags,
 					unsigned long stack_start,
 					unsigned long stack_size,
 					int __user *child_tidptr,
@@ -1715,6 +1862,7 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	put_task_stack(p);
 	free_task(p);
 fork_out:
 	return ERR_PTR(retval);
@@ -1780,6 +1928,7 @@ long _do_fork(unsigned long clone_flags,

 	p = copy_process(clone_flags, stack_start, stack_size,
 			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	add_latent_entropy();
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
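The stack cache added to alloc_thread_stack_node()/free_thread_stack() above is a small fixed-size free list: a freed stack is parked in an empty per-CPU slot instead of going straight to vfree(), and the next allocation reuses a parked stack before falling back to the slow vmalloc path. The sketch below is a stand-alone user-space analogue of that reuse pattern, not kernel code; the names (alloc_stack, free_stack, STACK_SZ) are illustrative, and a single global array stands in for the per-CPU slots that the kernel protects by disabling interrupts.

/* Illustrative analogue of the cached_stacks[] pattern above: keep up to
 * NR_CACHED freed stacks around and hand them back out before falling
 * back to a fresh allocation.  Hypothetical names, user-space only. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CACHED	2
#define STACK_SZ	(16 * 1024)

static void *cached[NR_CACHED];		/* stand-in for the per-CPU slots */

static void *alloc_stack(void)
{
	for (int i = 0; i < NR_CACHED; i++) {
		if (cached[i]) {		/* fast path: reuse a cached stack */
			void *s = cached[i];
			cached[i] = NULL;
			return s;
		}
	}
	return malloc(STACK_SZ);		/* slow path: fresh allocation */
}

static void free_stack(void *s)
{
	for (int i = 0; i < NR_CACHED; i++) {
		if (!cached[i]) {		/* park it for reuse */
			cached[i] = s;
			return;
		}
	}
	free(s);				/* cache full: really free it */
}

int main(void)
{
	void *a = alloc_stack(), *b = alloc_stack();
	free_stack(a);
	free_stack(b);
	/* Both allocations below are served from the cache, no malloc(). */
	void *c = alloc_stack(), *d = alloc_stack();
	printf("reused from cache: %d %d\n", c == a || c == b, d == a || d == b);
	free(c);
	free(d);
	return 0;
}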
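With CONFIG_THREAD_INFO_IN_TASK, the diff also gives the stack its own reference count (stack_refcount), so put_task_stack() can release it as soon as the last user drops it, possibly well before the task_struct itself is freed; free_task() then only warns if a stack somehow survived. Below is a minimal C11 analogue of that "last put frees" pattern, using stdatomic rather than the kernel's atomic_t; the struct and function names are illustrative only.

/* Minimal sketch of the stack refcounting introduced above: whoever drops
 * the last reference releases the stack, and the surrounding task object
 * can outlive it.  Illustrative names, not kernel API. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct task {
	void *stack;
	atomic_int stack_refcount;
};

static void release_task_stack(struct task *t)
{
	free(t->stack);			/* analogue of free_thread_stack() */
	t->stack = NULL;
}

static void put_task_stack(struct task *t)
{
	/* analogue of atomic_dec_and_test(): free only when the count hits zero */
	if (atomic_fetch_sub(&t->stack_refcount, 1) == 1)
		release_task_stack(t);
}

int main(void)
{
	struct task t = { .stack = malloc(16 * 1024) };

	atomic_init(&t.stack_refcount, 2);	/* e.g. the task and the scheduler hold refs */

	put_task_stack(&t);			/* first put: stack stays */
	printf("after first put: stack=%p\n", t.stack);
	put_task_stack(&t);			/* last put: stack freed */
	printf("after last put:  stack=%p\n", t.stack);
	return 0;
}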
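The reordering in dup_task_struct() exists because arch_dup_task_struct() copies the whole task_struct from the parent, which clobbers the child's freshly assigned stack, stack_vm_area and stack_refcount fields; the diff therefore rewrites them immediately after the copy, before any stack-related helper can run. The stand-alone sketch below, with purely illustrative types and names, shows the same clobber-then-restore ordering.

/* Sketch of why the stack fields are re-set after arch_dup_task_struct():
 * a whole-struct copy drags the parent's stack pointer along, so the
 * child's own value must be written back before it is used. */
#include <stdio.h>
#include <stdlib.h>

struct task {
	void *stack;
	int prio;		/* stands in for the many fields worth copying */
};

static struct task *dup_task(const struct task *orig)
{
	struct task *t = malloc(sizeof(*t));
	void *stack = malloc(16 * 1024);	/* the child's own stack */

	*t = *orig;		/* analogue of arch_dup_task_struct(): copies
				 * everything, including orig->stack, so the
				 * child briefly points at the parent's stack */
	t->stack = stack;	/* restore the stack field before anything uses it */
	return t;
}

int main(void)
{
	struct task parent = { .stack = malloc(16 * 1024), .prio = 120 };
	struct task *child = dup_task(&parent);

	printf("distinct stacks: %d, prio copied: %d\n",
	       child->stack != parent.stack, child->prio == parent.prio);
	free(child->stack);
	free(child);
	free(parent.stack);
	return 0;
}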