Diffstat (limited to 'kernel')
37 files changed, 719 insertions, 297 deletions
| diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..b58b2efb9b43 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)  	fd = *(int *)key;  	f = fget_raw(fd);  	if (!f) -		return NULL; +		return ERR_PTR(-EBADF);  	sdata = inode_storage_lookup(f->f_inode, map, true);  	fput(f); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 1a666a975416..70f6fd4fa305 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,  		tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;  		tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; -		err = arch_prepare_bpf_trampoline(image, +		err = arch_prepare_bpf_trampoline(NULL, image,  						  st_map->image + PAGE_SIZE,  						  &st_ops->func_models[i], 0,  						  tprogs, NULL); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2efeb5f4b343..b1a76fe046cb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4321,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,  		 * is not supported yet.  		 * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.  		 */ -		if (log->level & BPF_LOG_LEVEL) -			bpf_log(log, "arg#%d type is not a struct\n", arg);  		return NULL;  	}  	tname = btf_name_by_offset(btf, t->name_off); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0ae015ad1e05..75244ecb2389 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)  }  pure_initcall(bpf_jit_charge_init); -static int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 pages)  {  	if (atomic_long_add_return(pages, &bpf_jit_current) >  	    (bpf_jit_limit >> PAGE_SHIFT)) { @@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)  	return 0;  } -static void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 pages)  {  	atomic_long_sub(pages, &bpf_jit_current);  } @@ -1118,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)  	 * clone is guaranteed to not be locked.  	 */  	fp->aux = NULL; +	fp->stats = NULL; +	fp->active = NULL;  	__bpf_prog_free(fp);  } @@ -2342,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func)  /* Return TRUE if the JIT backend wants verifier to enable sub-register usage   * analysis code and wants explicit zero extension inserted by verifier.   * Otherwise, return FALSE. + * + * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if + * you don't override this. JITs that don't want these extra insns can detect + * them using insn_is_zext.   */  bool __weak bpf_jit_needs_zext(void)  { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3acc7e0b6916..faa54d58972c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -84,7 +84,7 @@ static const char *const bpf_atomic_alu_string[16] = {  	[BPF_ADD >> 4]  = "add",  	[BPF_AND >> 4]  = "and",  	[BPF_OR >> 4]  = "or", -	[BPF_XOR >> 4]  = "or", +	[BPF_XOR >> 4]  = "xor",  };  static const char *const bpf_ldst_string[] = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1576ff331ee4..d2de2abec35b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -543,11 +543,11 @@ int bpf_obj_get_user(const char __user *pathname, int flags)  		return PTR_ERR(raw);  	if (type == BPF_TYPE_PROG) -		ret = bpf_prog_new_fd(raw); +		ret = (f_flags != O_RDWR) ? 
-EINVAL : bpf_prog_new_fd(raw);  	else if (type == BPF_TYPE_MAP)  		ret = bpf_map_new_fd(raw, f_flags);  	else if (type == BPF_TYPE_LINK) -		ret = bpf_link_new_fd(raw); +		ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);  	else  		return -ENOENT; diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 79c5772465f1..53736e52c1df 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -60,9 +60,12 @@ static int finish(void)  			 &magic, sizeof(magic), &pos);  	if (n != sizeof(magic))  		return -EPIPE; +  	tgid = umd_ops.info.tgid; -	wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); -	umd_ops.info.tgid = NULL; +	if (tgid) { +		wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); +		umd_cleanup_helper(&umd_ops.info); +	}  	return 0;  } @@ -80,10 +83,18 @@ static int __init load_umd(void)  static void __exit fini_umd(void)  { +	struct pid *tgid; +  	bpf_preload_ops = NULL; +  	/* kill UMD in case it's still there due to earlier error */ -	kill_pid(umd_ops.info.tgid, SIGKILL, 1); -	umd_ops.info.tgid = NULL; +	tgid = umd_ops.info.tgid; +	if (tgid) { +		kill_pid(tgid, SIGKILL, 1); + +		wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); +		umd_cleanup_helper(&umd_ops.info); +	}  	umd_unload_blob(&umd_ops.info);  }  late_initcall(load_umd); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be35bfb7fb13..6fbc2abe9c91 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -517,9 +517,17 @@ const struct bpf_func_proto bpf_get_stack_proto = {  BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,  	   u32, size, u64, flags)  { -	struct pt_regs *regs = task_pt_regs(task); +	struct pt_regs *regs; +	long res; -	return __bpf_get_stack(regs, task, NULL, buf, size, flags); +	if (!try_get_task_stack(task)) +		return -EFAULT; + +	regs = task_pt_regs(task); +	res = __bpf_get_stack(regs, task, NULL, buf, size, flags); +	put_task_stack(task); + +	return res;  }  BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c859bc46d06c..250503482cda 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr)  			err = PTR_ERR(btf);  			goto free_map;  		} +		if (btf_is_kernel(btf)) { +			btf_put(btf); +			err = -EACCES; +			goto free_map; +		}  		map->btf = btf;  		if (attr->btf_value_type_id) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7bc3b3209224..4aa8b52adf25 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -9,6 +9,7 @@  #include <linux/btf.h>  #include <linux/rcupdate_trace.h>  #include <linux/rcupdate_wait.h> +#include <linux/module.h>  /* dummy _ops. The verifier will operate on target program's ops. 
*/  const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -57,19 +58,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)  			   PAGE_SIZE, true, ksym->name);  } -static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) -{ -	struct bpf_ksym *ksym = &tr->ksym; - -	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); -	bpf_image_ksym_add(tr->image, ksym); -} -  static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)  {  	struct bpf_trampoline *tr;  	struct hlist_head *head; -	void *image;  	int i;  	mutex_lock(&trampoline_mutex); @@ -84,14 +76,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)  	if (!tr)  		goto out; -	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ -	image = bpf_jit_alloc_exec_page(); -	if (!image) { -		kfree(tr); -		tr = NULL; -		goto out; -	} -  	tr->key = key;  	INIT_HLIST_NODE(&tr->hlist);  	hlist_add_head(&tr->hlist, head); @@ -99,14 +83,31 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)  	mutex_init(&tr->mutex);  	for (i = 0; i < BPF_TRAMP_MAX; i++)  		INIT_HLIST_HEAD(&tr->progs_hlist[i]); -	tr->image = image; -	INIT_LIST_HEAD_RCU(&tr->ksym.lnode); -	bpf_trampoline_ksym_add(tr);  out:  	mutex_unlock(&trampoline_mutex);  	return tr;  } +static int bpf_trampoline_module_get(struct bpf_trampoline *tr) +{ +	struct module *mod; +	int err = 0; + +	preempt_disable(); +	mod = __module_text_address((unsigned long) tr->func.addr); +	if (mod && !try_module_get(mod)) +		err = -ENOENT; +	preempt_enable(); +	tr->mod = mod; +	return err; +} + +static void bpf_trampoline_module_put(struct bpf_trampoline *tr) +{ +	module_put(tr->mod); +	tr->mod = NULL; +} +  static int is_ftrace_location(void *ip)  {  	long addr; @@ -128,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)  		ret = unregister_ftrace_direct((long)ip, (long)old_addr);  	else  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + +	if (!ret) +		bpf_trampoline_module_put(tr);  	return ret;  } @@ -154,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)  		return ret;  	tr->func.ftrace_managed = ret; +	if (bpf_trampoline_module_get(tr)) +		return -ENOENT; +  	if (tr->func.ftrace_managed)  		ret = register_ftrace_direct((long)ip, (long)new_addr);  	else  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + +	if (ret) +		bpf_trampoline_module_put(tr);  	return ret;  } @@ -185,10 +195,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)  	return tprogs;  } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ +	struct bpf_tramp_image *im; + +	im = container_of(work, struct bpf_tramp_image, work); +	bpf_image_ksym_del(&im->ksym); +	bpf_jit_free_exec(im->image); +	bpf_jit_uncharge_modmem(1); +	percpu_ref_exit(&im->pcref); +	kfree_rcu(im, rcu); +} + +/* callback, fexit step 3 or fentry step 2 */ +static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) +{ +	struct bpf_tramp_image *im; + +	im = container_of(rcu, struct bpf_tramp_image, rcu); +	INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); +	schedule_work(&im->work); +} + +/* callback, fexit step 2. Called after percpu_ref_kill confirms. 
*/ +static void __bpf_tramp_image_release(struct percpu_ref *pcref) +{ +	struct bpf_tramp_image *im; + +	im = container_of(pcref, struct bpf_tramp_image, pcref); +	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +/* callback, fexit or fentry step 1 */ +static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) +{ +	struct bpf_tramp_image *im; + +	im = container_of(rcu, struct bpf_tramp_image, rcu); +	if (im->ip_after_call) +		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ +		percpu_ref_kill(&im->pcref); +	else +		/* the case of fentry trampoline */ +		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +static void bpf_tramp_image_put(struct bpf_tramp_image *im) +{ +	/* The trampoline image that calls original function is using: +	 * rcu_read_lock_trace to protect sleepable bpf progs +	 * rcu_read_lock to protect normal bpf progs +	 * percpu_ref to protect trampoline itself +	 * rcu tasks to protect trampoline asm not covered by percpu_ref +	 * (which are few asm insns before __bpf_tramp_enter and +	 *  after __bpf_tramp_exit) +	 * +	 * The trampoline is unreachable before bpf_tramp_image_put(). +	 * +	 * First, patch the trampoline to avoid calling into fexit progs. +	 * The progs will be freed even if the original function is still +	 * executing or sleeping. +	 * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on +	 * first few asm instructions to execute and call into +	 * __bpf_tramp_enter->percpu_ref_get. +	 * Then use percpu_ref_kill to wait for the trampoline and the original +	 * function to finish. +	 * Then use call_rcu_tasks() to make sure few asm insns in +	 * the trampoline epilogue are done as well. +	 * +	 * In !PREEMPT case the task that got interrupted in the first asm +	 * insns won't go through an RCU quiescent state which the +	 * percpu_ref_kill will be waiting for. Hence the first +	 * call_rcu_tasks() is not necessary. +	 */ +	if (im->ip_after_call) { +		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, +					     NULL, im->ip_epilogue); +		WARN_ON(err); +		if (IS_ENABLED(CONFIG_PREEMPTION)) +			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); +		else +			percpu_ref_kill(&im->pcref); +		return; +	} + +	/* The trampoline without fexit and fmod_ret progs doesn't call original +	 * function and doesn't use percpu_ref. +	 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. +	 * Then use call_rcu_tasks() to wait for the rest of trampoline asm +	 * and normal progs. 
+	 */ +	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); +} + +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +{ +	struct bpf_tramp_image *im; +	struct bpf_ksym *ksym; +	void *image; +	int err = -ENOMEM; + +	im = kzalloc(sizeof(*im), GFP_KERNEL); +	if (!im) +		goto out; + +	err = bpf_jit_charge_modmem(1); +	if (err) +		goto out_free_im; + +	err = -ENOMEM; +	im->image = image = bpf_jit_alloc_exec_page(); +	if (!image) +		goto out_uncharge; + +	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); +	if (err) +		goto out_free_image; + +	ksym = &im->ksym; +	INIT_LIST_HEAD_RCU(&ksym->lnode); +	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); +	bpf_image_ksym_add(image, ksym); +	return im; + +out_free_image: +	bpf_jit_free_exec(im->image); +out_uncharge: +	bpf_jit_uncharge_modmem(1); +out_free_im: +	kfree(im); +out: +	return ERR_PTR(err); +} +  static int bpf_trampoline_update(struct bpf_trampoline *tr)  { -	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; -	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; +	struct bpf_tramp_image *im;  	struct bpf_tramp_progs *tprogs;  	u32 flags = BPF_TRAMP_F_RESTORE_REGS;  	int err, total; @@ -198,41 +340,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)  		return PTR_ERR(tprogs);  	if (total == 0) { -		err = unregister_fentry(tr, old_image); +		err = unregister_fentry(tr, tr->cur_image->image); +		bpf_tramp_image_put(tr->cur_image); +		tr->cur_image = NULL;  		tr->selector = 0;  		goto out;  	} +	im = bpf_tramp_image_alloc(tr->key, tr->selector); +	if (IS_ERR(im)) { +		err = PTR_ERR(im); +		goto out; +	} +  	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||  	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)  		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; -	/* Though the second half of trampoline page is unused a task could be -	 * preempted in the middle of the first half of trampoline and two -	 * updates to trampoline would change the code from underneath the -	 * preempted task. Hence wait for tasks to voluntarily schedule or go -	 * to userspace. -	 * The same trampoline can hold both sleepable and non-sleepable progs. -	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable -	 * programs finish executing. -	 * Wait for these two grace periods together. 
-	 */ -	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); - -	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, +	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,  					  &tr->func.model, flags, tprogs,  					  tr->func.addr);  	if (err < 0)  		goto out; -	if (tr->selector) +	WARN_ON(tr->cur_image && tr->selector == 0); +	WARN_ON(!tr->cur_image && tr->selector); +	if (tr->cur_image)  		/* progs already running at this address */ -		err = modify_fentry(tr, old_image, new_image); +		err = modify_fentry(tr, tr->cur_image->image, im->image);  	else  		/* first time registering */ -		err = register_fentry(tr, new_image); +		err = register_fentry(tr, im->image);  	if (err)  		goto out; +	if (tr->cur_image) +		bpf_tramp_image_put(tr->cur_image); +	tr->cur_image = im;  	tr->selector++;  out:  	kfree(tprogs); @@ -364,17 +507,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)  		goto out;  	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))  		goto out; -	bpf_image_ksym_del(&tr->ksym); -	/* This code will be executed when all bpf progs (both sleepable and -	 * non-sleepable) went through -	 * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). -	 * Hence no need for another synchronize_rcu_tasks_trace() here, -	 * but synchronize_rcu_tasks() is still needed, since trampoline -	 * may not have had any sleepable programs and we need to wait -	 * for tasks to get out of trampoline code before freeing it. +	/* This code will be executed even when the last bpf_tramp_image +	 * is alive. All progs are detached from the trampoline and the +	 * trampoline image is patched with jmp into epilogue to skip +	 * fexit progs. The fentry-only trampoline will be freed via +	 * multiple rcu callbacks.  	 
*/ -	synchronize_rcu_tasks(); -	bpf_jit_free_exec(tr->image);  	hlist_del(&tr->hlist);  	kfree(tr);  out: @@ -478,8 +616,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)  	rcu_read_unlock_trace();  } +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) +{ +	percpu_ref_get(&tr->pcref); +} + +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) +{ +	percpu_ref_put(&tr->pcref); +} +  int __weak -arch_prepare_bpf_trampoline(void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,  			    const struct btf_func_model *m, u32 flags,  			    struct bpf_tramp_progs *tprogs,  			    void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dda9d81f12c..3a738724a380 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)  		func_id == BPF_FUNC_skc_to_tcp_request_sock;  } +static bool is_cmpxchg_insn(const struct bpf_insn *insn) +{ +	return BPF_CLASS(insn->code) == BPF_STX && +	       BPF_MODE(insn->code) == BPF_ATOMIC && +	       insn->imm == BPF_CMPXCHG; +} +  /* string representation of 'enum bpf_reg_type' */  static const char * const reg_type_str[] = {  	[NOT_INIT]		= "?", @@ -1120,7 +1127,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)  		reg->type = PTR_TO_RDWR_BUF;  		break;  	default: -		WARN_ON("unknown nullable register type"); +		WARN_ONCE(1, "unknown nullable register type");  	}  } @@ -1703,7 +1710,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,  	}  	if (class == BPF_STX) { -		if (reg->type != SCALAR_VALUE) +		/* BPF_STX (including atomic variants) has multiple source +		 * operands, one of which is a ptr. Check whether the caller is +		 * asking about it. +		 */ +		if (t == SRC_OP && reg->type != SCALAR_VALUE)  			return true;  		return BPF_SIZE(code) == BPF_DW;  	} @@ -1735,22 +1746,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,  	return true;  } -/* Return TRUE if INSN doesn't have explicit value define. */ -static bool insn_no_def(struct bpf_insn *insn) +/* Return the regno defined by the insn, or -1. */ +static int insn_def_regno(const struct bpf_insn *insn)  { -	u8 class = BPF_CLASS(insn->code); - -	return (class == BPF_JMP || class == BPF_JMP32 || -		class == BPF_STX || class == BPF_ST); +	switch (BPF_CLASS(insn->code)) { +	case BPF_JMP: +	case BPF_JMP32: +	case BPF_ST: +		return -1; +	case BPF_STX: +		if (BPF_MODE(insn->code) == BPF_ATOMIC && +		    (insn->imm & BPF_FETCH)) { +			if (insn->imm == BPF_CMPXCHG) +				return BPF_REG_0; +			else +				return insn->src_reg; +		} else { +			return -1; +		} +	default: +		return insn->dst_reg; +	}  }  /* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/  static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)  { -	if (insn_no_def(insn)) +	int dst_reg = insn_def_regno(insn); + +	if (dst_reg == -1)  		return false; -	return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +	return !is_reg64(env, insn, dst_reg, NULL, DST_OP);  }  static void mark_insn_zext(struct bpf_verifier_env *env, @@ -5834,10 +5861,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,  {  	bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||  			    (opcode == BPF_SUB && !off_is_neg); -	u32 off; +	u32 off, max;  	switch (ptr_reg->type) {  	case PTR_TO_STACK: +		/* Offset 0 is out-of-bounds, but acceptable start for the +		 * left direction, see BPF_REG_FP. +		 */ +		max = MAX_BPF_STACK + mask_to_left;  		/* Indirect variable offset stack access is prohibited in  		 * unprivileged mode so it's not handled here.  		 */ @@ -5845,16 +5876,17 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,  		if (mask_to_left)  			*ptr_limit = MAX_BPF_STACK + off;  		else -			*ptr_limit = -off; -		return 0; +			*ptr_limit = -off - 1; +		return *ptr_limit >= max ? -ERANGE : 0;  	case PTR_TO_MAP_VALUE: +		max = ptr_reg->map_ptr->value_size;  		if (mask_to_left) {  			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;  		} else {  			off = ptr_reg->smin_value + ptr_reg->off; -			*ptr_limit = ptr_reg->map_ptr->value_size - off; +			*ptr_limit = ptr_reg->map_ptr->value_size - off - 1;  		} -		return 0; +		return *ptr_limit >= max ? -ERANGE : 0;  	default:  		return -EINVAL;  	} @@ -5907,6 +5939,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,  	u32 alu_state, alu_limit;  	struct bpf_reg_state tmp;  	bool ret; +	int err;  	if (can_skip_alu_sanitation(env, insn))  		return 0; @@ -5922,10 +5955,13 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,  	alu_state |= ptr_is_dst_reg ?  		     
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; -	if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) -		return 0; -	if (update_alu_sanitation_state(aux, alu_state, alu_limit)) -		return -EACCES; +	err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); +	if (err < 0) +		return err; + +	err = update_alu_sanitation_state(aux, alu_state, alu_limit); +	if (err < 0) +		return err;  do_sim:  	/* Simulate and find potential out-of-bounds access under  	 * speculative execution from truncation as a result of @@ -6076,7 +6112,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	case BPF_ADD:  		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);  		if (ret < 0) { -			verbose(env, "R%d tried to add from different maps or paths\n", dst); +			verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);  			return ret;  		}  		/* We can take a fixed offset as long as it doesn't overflow @@ -6131,7 +6167,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	case BPF_SUB:  		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);  		if (ret < 0) { -			verbose(env, "R%d tried to sub from different maps or paths\n", dst); +			verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);  			return ret;  		}  		if (dst_reg == off_reg) { @@ -9029,6 +9065,10 @@ static int check_btf_info(struct bpf_verifier_env *env,  	btf = btf_get_by_fd(attr->prog_btf_fd);  	if (IS_ERR(btf))  		return PTR_ERR(btf); +	if (btf_is_kernel(btf)) { +		btf_put(btf); +		return -EACCES; +	}  	env->prog->aux->btf = btf;  	err = check_btf_func(env, attr, uattr); @@ -11006,9 +11046,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,  	for (i = 0; i < len; i++) {  		int adj_idx = i + delta;  		struct bpf_insn insn; -		u8 load_reg; +		int load_reg;  		insn = insns[adj_idx]; +		load_reg = insn_def_regno(&insn);  		if (!aux[adj_idx].zext_dst) {  			u8 code, class;  			u32 imm_rnd; @@ -11018,14 +11059,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,  			code = insn.code;  			class = BPF_CLASS(code); -			if (insn_no_def(&insn)) +			if (load_reg == -1)  				continue;  			/* NOTE: arg "reg" (the fourth one) is only used for -			 *       BPF_STX which has been ruled out in above -			 *       check, it is safe to pass NULL here. +			 *       BPF_STX + SRC_OP, so it is safe to pass NULL +			 *       here.  			 */ -			if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { +			if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {  				if (class == BPF_LD &&  				    BPF_MODE(code) == BPF_IMM)  					i++; @@ -11040,31 +11081,28 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,  			imm_rnd = get_random_int();  			rnd_hi32_patch[0] = insn;  			rnd_hi32_patch[1].imm = imm_rnd; -			rnd_hi32_patch[3].dst_reg = insn.dst_reg; +			rnd_hi32_patch[3].dst_reg = load_reg;  			patch = rnd_hi32_patch;  			patch_len = 4;  			goto apply_patch_buffer;  		} -		if (!bpf_jit_needs_zext()) +		/* Add in an zero-extend instruction if a) the JIT has requested +		 * it or b) it's a CMPXCHG. +		 * +		 * The latter is because: BPF_CMPXCHG always loads a value into +		 * R0, therefore always zero-extends. However some archs' +		 * equivalent instruction only does this load when the +		 * comparison is successful. This detail of CMPXCHG is +		 * orthogonal to the general zero-extension behaviour of the +		 * CPU, so it's treated independently of bpf_jit_needs_zext. 
+		 */ +		if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))  			continue; -		/* zext_dst means that we want to zero-extend whatever register -		 * the insn defines, which is dst_reg most of the time, with -		 * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. -		 */ -		if (BPF_CLASS(insn.code) == BPF_STX && -		    BPF_MODE(insn.code) == BPF_ATOMIC) { -			/* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not -			 * define any registers, therefore zext_dst cannot be -			 * set. -			 */ -			if (WARN_ON(!(insn.imm & BPF_FETCH))) -				return -EINVAL; -			load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 -							   : insn.src_reg; -		} else { -			load_reg = insn.dst_reg; +		if (WARN_ON(load_reg == -1)) { +			verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); +			return -EFAULT;  		}  		zext_patch[0] = insn; @@ -11635,7 +11673,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  			off_reg = issrc ? insn->src_reg : insn->dst_reg;  			if (isneg)  				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); -			*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); +			*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);  			*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);  			*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);  			*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); @@ -12120,6 +12158,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)  	u32 btf_id, member_idx;  	const char *mname; +	if (!prog->gpl_compatible) { +		verbose(env, "struct ops programs must have a GPL compatible license\n"); +		return -EINVAL; +	} +  	btf_id = prog->aux->attach_btf_id;  	st_ops = bpf_struct_ops_find(btf_id);  	if (!st_ops) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 0aeca5f3c0ac..03db40f6cba9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);  static atomic_t perf_sched_count;  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages);  static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);  static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ unlock:  	}  } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); +  void perf_sched_cb_dec(struct pmu *pmu)  {  	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); -	--cpuctx->sched_cb_usage; +	this_cpu_dec(perf_sched_cb_usages); + +	if (!--cpuctx->sched_cb_usage) +		list_del(&cpuctx->sched_cb_entry);  } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)  {  	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); -	cpuctx->sched_cb_usage++; +	if (!cpuctx->sched_cb_usage++) +		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + +	this_cpu_inc(perf_sched_cb_usages);  }  /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);  } +static void perf_pmu_sched_task(struct task_struct *prev, +				struct task_struct *next, +				bool sched_in) +{ +	struct perf_cpu_context *cpuctx; + +	if (prev == next) +		return; + +	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { +		/* will be handled in perf_event_context_sched_in/out */ +		if (cpuctx->task_ctx) +			continue; + +		__perf_pmu_sched_task(cpuctx, sched_in); +	} +} +  static void perf_event_switch(struct task_struct *task,  			      struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void 
__perf_event_task_sched_out(struct task_struct *task,  {  	int ctxn; +	if (__this_cpu_read(perf_sched_cb_usages)) +		perf_pmu_sched_task(task, next, false); +  	if (atomic_read(&nr_switch_events))  		perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,  	if (atomic_read(&nr_switch_events))  		perf_event_switch(task, prev, true); + +	if (__this_cpu_read(perf_sched_cb_usages)) +		perf_pmu_sched_task(prev, task, true);  }  static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)  	if (event->parent)  		return; -	if (event->attach_state & PERF_ATTACH_TASK) +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))  		dec = true;  	if (event->attr.mmap || event->attr.mmap_data)  		atomic_dec(&nr_mmap_events); @@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)  	if (event->parent)  		return; -	if (event->attach_state & PERF_ATTACH_TASK) +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))  		inc = true;  	if (event->attr.mmap || event->attr.mmap_data)  		atomic_inc(&nr_mmap_events); @@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)  #ifdef CONFIG_CGROUP_PERF  		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));  #endif +		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));  	}  } diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..426cd0c51f9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)  #endif  } +static void mm_init_pasid(struct mm_struct *mm) +{ +#ifdef CONFIG_IOMMU_SUPPORT +	mm->pasid = INIT_PASID; +#endif +} +  static void mm_init_uprobes_state(struct mm_struct *mm)  {  #ifdef CONFIG_UPROBES @@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,  	mm_init_cpumask(mm);  	mm_init_aio(mm);  	mm_init_owner(mm, p); +	mm_init_pasid(mm);  	RCU_INIT_POINTER(mm->exe_file, NULL);  	mmu_notifier_subscriptions_init(mm);  	init_tlb_flush_pending(mm); @@ -1940,8 +1948,14 @@ static __latent_entropy struct task_struct *copy_process(  	p = dup_task_struct(current, node);  	if (!p)  		goto fork_out; -	if (args->io_thread) +	if (args->io_thread) { +		/* +		 * Mark us an IO worker, and block any signal that isn't +		 * fatal or STOP +		 */  		p->flags |= PF_IO_WORKER; +		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); +	}  	/*  	 * This _must_ happen before we call free_task(), i.e. 
before we jump @@ -2430,14 +2444,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)  		.stack_size	= (unsigned long)arg,  		.io_thread	= 1,  	}; -	struct task_struct *tsk; -	tsk = copy_process(NULL, 0, node, &args); -	if (!IS_ERR(tsk)) { -		sigfillset(&tsk->blocked); -		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); -	} -	return tsk; +	return copy_process(NULL, 0, node, &args);  }  /* diff --git a/kernel/futex.c b/kernel/futex.c index e68db7745039..00febd6dea9c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2728,14 +2728,13 @@ retry:  		goto out;  	restart = &current->restart_block; -	restart->fn = futex_wait_restart;  	restart->futex.uaddr = uaddr;  	restart->futex.val = val;  	restart->futex.time = *abs_time;  	restart->futex.bitset = bitset;  	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; -	ret = -ERESTART_RESTARTBLOCK; +	ret = set_restart_fn(restart, futex_wait_restart);  out:  	if (to) { diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index c94b820a1b62..c466c7fbdece 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -70,12 +70,16 @@ struct gcov_fn_info {  	u32 ident;  	u32 checksum; +#if CONFIG_CLANG_VERSION < 110000  	u8 use_extra_checksum; +#endif  	u32 cfg_checksum;  	u32 num_counters;  	u64 *counters; +#if CONFIG_CLANG_VERSION < 110000  	const char *function_name; +#endif  };  static struct gcov_info *current_info; @@ -105,6 +109,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)  }  EXPORT_SYMBOL(llvm_gcov_init); +#if CONFIG_CLANG_VERSION < 110000  void llvm_gcda_start_file(const char *orig_filename, const char version[4],  		u32 checksum)  { @@ -113,7 +118,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],  	current_info->checksum = checksum;  }  EXPORT_SYMBOL(llvm_gcda_start_file); +#else +void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) +{ +	current_info->filename = orig_filename; +	current_info->version = version; +	current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); +#endif +#if CONFIG_CLANG_VERSION < 110000  void llvm_gcda_emit_function(u32 ident, const char *function_name,  		u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)  { @@ -132,6 +147,21 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,  	list_add_tail(&info->head, &current_info->functions);  } +#else +void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) +{ +	struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + +	if (!info) +		return; + +	INIT_LIST_HEAD(&info->head); +	info->ident = ident; +	info->checksum = func_checksum; +	info->cfg_checksum = cfg_checksum; +	list_add_tail(&info->head, &current_info->functions); +} +#endif  EXPORT_SYMBOL(llvm_gcda_emit_function);  void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) @@ -262,11 +292,16 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)  		!list_is_last(&fn_ptr2->head, &info2->functions)) {  		if (fn_ptr1->checksum != fn_ptr2->checksum)  			return false; +#if CONFIG_CLANG_VERSION < 110000  		if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)  			return false;  		if (fn_ptr1->use_extra_checksum &&  			fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)  			return false; +#else +		if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) +			return false; +#endif  		fn_ptr1 = list_next_entry(fn_ptr1, head);  		fn_ptr2 = list_next_entry(fn_ptr2, head);  	} @@ -295,6 +330,7 @@ void gcov_info_add(struct
gcov_info *dst, struct gcov_info *src)  	}  } +#if CONFIG_CLANG_VERSION < 110000  static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)  {  	size_t cv_size; /* counter values size */ @@ -322,6 +358,28 @@ err_name:  	kfree(fn_dup);  	return NULL;  } +#else +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ +	size_t cv_size; /* counter values size */ +	struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), +			GFP_KERNEL); +	if (!fn_dup) +		return NULL; +	INIT_LIST_HEAD(&fn_dup->head); + +	cv_size = fn->num_counters * sizeof(fn->counters[0]); +	fn_dup->counters = vmalloc(cv_size); +	if (!fn_dup->counters) { +		kfree(fn_dup); +		return NULL; +	} + +	memcpy(fn_dup->counters, fn->counters, cv_size); + +	return fn_dup; +} +#endif  /**   * gcov_info_dup - duplicate profiling data set @@ -362,6 +420,7 @@ err:   * gcov_info_free - release memory for profiling data set duplicate   * @info: profiling data set duplicate to free   */ +#if CONFIG_CLANG_VERSION < 110000  void gcov_info_free(struct gcov_info *info)  {  	struct gcov_fn_info *fn, *tmp; @@ -375,6 +434,20 @@ void gcov_info_free(struct gcov_info *info)  	kfree(info->filename);  	kfree(info);  } +#else +void gcov_info_free(struct gcov_info *info) +{ +	struct gcov_fn_info *fn, *tmp; + +	list_for_each_entry_safe(fn, tmp, &info->functions, head) { +		vfree(fn->counters); +		list_del(&fn->head); +		kfree(fn); +	} +	kfree(info->filename); +	kfree(info); +} +#endif  #define ITER_STRIDE	PAGE_SIZE @@ -460,17 +533,22 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)  	list_for_each_entry(fi_ptr, &info->functions, head) {  		u32 i; -		u32 len = 2; - -		if (fi_ptr->use_extra_checksum) -			len++;  		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); -		pos += store_gcov_u32(buffer, pos, len); +#if CONFIG_CLANG_VERSION < 110000 +		pos += store_gcov_u32(buffer, pos, +			fi_ptr->use_extra_checksum ? 3 : 2); +#else +		pos += store_gcov_u32(buffer, pos, 3); +#endif  		pos += store_gcov_u32(buffer, pos, fi_ptr->ident);  		pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); +#if CONFIG_CLANG_VERSION < 110000  		if (fi_ptr->use_extra_checksum)  			pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); +#else +		pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); +#endif  		pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);  		pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 48006608baf0..40880c350b95 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {   * irq_domain_create_sim - Create a new interrupt simulator irq_domain and   *                         allocate a range of dummy interrupts.   * - * @fnode:      struct fwnode_handle to be associated with this domain. + * @fwnode:     struct fwnode_handle to be associated with this domain.   * @num_irqs:   Number of interrupts to allocate.   *   * On success: return a new irq_domain object. @@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)   *                              a managed device.   *   * @dev:        Device to initialize the simulator object for. - * @fnode:      struct fwnode_handle to be associated with this domain. + * @fwnode:     struct fwnode_handle to be associated with this domain.   * @num_irqs:   Number of interrupts to allocate   *   * On success: return a new irq_domain object. 
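
[Editor's note] The kernel/gcov/clang.c hunks above move convert_to_gcda() to the Clang >= 11 .gcda layout, where every GCOV_TAG_FUNCTION record has a fixed length of three words (ident, func_checksum, cfg_checksum) instead of the older 2-or-3-word form. A minimal stand-alone sketch of that record layout follows; store_u32() and emit_function_record() are illustrative stand-ins for the kernel's store_gcov_u32() helper (not kernel code), and the tag value is the usual gcov-io one:

/*
 * Stand-alone sketch: serialize one GCOV_TAG_FUNCTION record the way the
 * Clang >= 11 branch of convert_to_gcda() does: tag, record length in
 * 4-byte words (always 3), ident, func_checksum, cfg_checksum.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GCOV_TAG_FUNCTION	0x01000000u	/* tag value as in gcov-io */

/* Mimics store_gcov_u32(): store a u32 at pos, return bytes written. */
static size_t store_u32(char *buf, size_t pos, uint32_t v)
{
	if (buf)
		memcpy(buf + pos, &v, sizeof(v));
	return sizeof(v);
}

static size_t emit_function_record(char *buf, size_t pos, uint32_t ident,
				   uint32_t func_checksum, uint32_t cfg_checksum)
{
	size_t start = pos;

	pos += store_u32(buf, pos, GCOV_TAG_FUNCTION);
	pos += store_u32(buf, pos, 3);	/* length in words, fixed for Clang >= 11 */
	pos += store_u32(buf, pos, ident);
	pos += store_u32(buf, pos, func_checksum);
	pos += store_u32(buf, pos, cfg_checksum);
	return pos - start;
}

int main(void)
{
	char buf[32];

	printf("function record: %zu bytes\n",
	       emit_function_record(buf, 0, 42, 0x1234, 0x5678));
	return 0;
}

The pre-Clang-11 format instead emits two or three words depending on use_extra_checksum, which is why the patch keeps both paths behind CONFIG_CLANG_VERSION checks.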
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 288151393a06..d10ab1d689d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);  static void debugfs_add_domain_dir(struct irq_domain *d)  { -	if (!d->name || !domain_dir || d->debugfs_file) +	if (!d->name || !domain_dir)  		return; -	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, -					      &irq_domain_debug_fops); +	debugfs_create_file(d->name, 0444, domain_dir, d, +			    &irq_domain_debug_fops);  }  static void debugfs_remove_domain_dir(struct irq_domain *d)  { -	debugfs_remove(d->debugfs_file); -	d->debugfs_file = NULL; +	debugfs_remove(debugfs_lookup(d->name, domain_dir));  }  void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dec3f73e8db9..21ea370fccda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)  	irqreturn_t ret;  	local_bh_disable(); +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +		local_irq_disable();  	ret = action->thread_fn(action->irq, action->dev_id);  	if (ret == IRQ_HANDLED)  		atomic_inc(&desc->threads_handled);  	irq_finalize_oneshot(desc, action); +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +		local_irq_enable();  	local_bh_enable();  	return ret;  } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index c6a39d662935..ba39fbb1f8e7 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)  		return false;  	if (!kernel_text_address(jump_entry_code(entry))) { +		/* +		 * This skips patching built-in __exit, which +		 * is part of init_section_contains() but is +		 * not part of kernel_text_address(). +		 * +		 * Skipping built-in __exit is fine since it +		 * will never be executed. +		 */  		WARN_ONCE(!jump_entry_is_init(entry),  			  "can't patch jump_label at %pS",  			  (void *)jump_entry_code(entry)); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c6d0c1dc6253..f160f1c97ca1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -705,7 +705,7 @@ static void print_lock_name(struct lock_class *class)  	printk(KERN_CONT " (");  	__print_lock_name(class); -	printk(KERN_CONT "){%s}-{%hd:%hd}", usage, +	printk(KERN_CONT "){%s}-{%d:%d}", usage,  			class->wait_type_outer ?: class->wait_type_inner,  			class->wait_type_inner);  } @@ -930,7 +930,8 @@ static bool assign_lock_key(struct lockdep_map *lock)  		/* Debug-check: all keys must be persistent! 
*/  		debug_locks_off();  		pr_err("INFO: trying to register non-static key.\n"); -		pr_err("the code is fine but needs lockdep annotation.\n"); +		pr_err("The code is fine but needs lockdep annotation, or maybe\n"); +		pr_err("you didn't initialize this object before use?\n");  		pr_err("turning off the locking correctness validator.\n");  		dump_stack();  		return false; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index adb935090768..622ebdfcd083 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)   */  static __always_inline bool  mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, -		      const bool use_ww_ctx, struct mutex_waiter *waiter) +		      struct mutex_waiter *waiter)  {  	if (!waiter) {  		/* @@ -702,7 +702,7 @@ fail:  #else  static __always_inline bool  mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, -		      const bool use_ww_ctx, struct mutex_waiter *waiter) +		      struct mutex_waiter *waiter)  {  	return false;  } @@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	struct ww_mutex *ww;  	int ret; +	if (!use_ww_ctx) +		ww_ctx = NULL; +  	might_sleep();  #ifdef CONFIG_DEBUG_MUTEXES @@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  #endif  	ww = container_of(lock, struct ww_mutex, base); -	if (use_ww_ctx && ww_ctx) { +	if (ww_ctx) {  		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))  			return -EALREADY; @@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);  	if (__mutex_trylock(lock) || -	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { +	    mutex_optimistic_spin(lock, ww_ctx, NULL)) {  		/* got the lock, yay! */  		lock_acquired(&lock->dep_map, ip); -		if (use_ww_ctx && ww_ctx) +		if (ww_ctx)  			ww_mutex_set_context_fastpath(ww, ww_ctx);  		preempt_enable();  		return 0; @@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	 * After waiting to acquire the wait_lock, try again.  	 */  	if (__mutex_trylock(lock)) { -		if (use_ww_ctx && ww_ctx) +		if (ww_ctx)  			__ww_mutex_check_waiters(lock, ww_ctx);  		goto skip_wait; @@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  			goto err;  		} -		if (use_ww_ctx && ww_ctx) { +		if (ww_ctx) {  			ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);  			if (ret)  				goto err; @@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * ww_mutex needs to always recheck its position since its waiter  		 * list is not FIFO ordered.  		 */ -		if ((use_ww_ctx && ww_ctx) || !first) { +		if (ww_ctx || !first) {  			first = __mutex_waiter_is_first(lock, &waiter);  			if (first)  				__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); @@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * or we must see its unlock and acquire.  		 
*/  		if (__mutex_trylock(lock) || -		    (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) +		    (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))  			break;  		spin_lock(&lock->wait_lock); @@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  acquired:  	__set_current_state(TASK_RUNNING); -	if (use_ww_ctx && ww_ctx) { +	if (ww_ctx) {  		/*  		 * Wound-Wait; we stole the lock (!first_waiter), check the  		 * waiters as anyone might want to wound us. @@ -1068,7 +1071,7 @@ skip_wait:  	/* got the lock - cleanup and rejoice! */  	lock_acquired(&lock->dep_map, ip); -	if (use_ww_ctx && ww_ctx) +	if (ww_ctx)  		ww_mutex_lock_acquired(ww, ww_ctx);  	spin_unlock(&lock->wait_lock); diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 1358fa4abfa8..0f4530b3a8cd 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -98,7 +98,7 @@ static int __init em_debug_init(void)  	return 0;  } -core_initcall(em_debug_init); +fs_initcall(em_debug_init);  #else /* CONFIG_DEBUG_FS */  static void em_debug_create_pd(struct device *dev) {}  static void em_debug_remove_pd(struct device *dev) {} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 821cf1723814..61db50f7ca86 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,  	audit_ptrace(task);  	retval = -EPERM; -	if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER))) +	if (unlikely(task->flags & PF_KTHREAD))  		goto out;  	if (same_thread_group(task, current))  		goto out; diff --git a/kernel/reboot.c b/kernel/reboot.c index eb1b15850761..a6ad5eb2fa73 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)  void kernel_restart(char *cmd)  {  	kernel_restart_prepare(cmd); -	if (pm_power_off_prepare) -		pm_power_off_prepare();  	migrate_to_reboot_cpu();  	syscore_shutdown();  	if (!cmd) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca2bb629595f..98191218d891 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1862,8 +1862,13 @@ struct migration_arg {  	struct set_affinity_pending	*pending;  }; +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */  struct set_affinity_pending {  	refcount_t		refs; +	unsigned int		stop_pending;  	struct completion	done;  	struct cpu_stop_work	stop_work;  	struct migration_arg	arg; @@ -1898,8 +1903,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,   */  static int migration_cpu_stop(void *data)  { -	struct set_affinity_pending *pending;  	struct migration_arg *arg = data; +	struct set_affinity_pending *pending = arg->pending;  	struct task_struct *p = arg->task;  	int dest_cpu = arg->dest_cpu;  	struct rq *rq = this_rq(); @@ -1921,7 +1926,6 @@ static int migration_cpu_stop(void *data)  	raw_spin_lock(&p->pi_lock);  	rq_lock(rq, &rf); -	pending = p->migration_pending;  	/*  	 * If task_rq(p) != rq, it cannot be migrated here, because we're  	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1932,21 +1936,14 @@ static int migration_cpu_stop(void *data)  			goto out;  		if (pending) { -			p->migration_pending = NULL; +			if (p->migration_pending == pending) +				p->migration_pending = NULL;  			complete = true;  		} -		/* migrate_enable() --  we must not race against SCA */  		if (dest_cpu < 0) { -			/* -			 * When this was migrate_enable() but we no longer -			 * have a @pending, 
a concurrent SCA 'fixed' things -			 * and we should be valid again. Nothing to do. -			 */ -			if (!pending) { -				WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); +			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))  				goto out; -			}  			dest_cpu = cpumask_any_distribute(&p->cpus_mask);  		} @@ -1956,7 +1953,14 @@ static int migration_cpu_stop(void *data)  		else  			p->wake_cpu = dest_cpu; -	} else if (dest_cpu < 0 || pending) { +		/* +		 * XXX __migrate_task() can fail, at which point we might end +		 * up running on a dodgy CPU, AFAICT this can only happen +		 * during CPU hotplug, at which point we'll get pushed out +		 * anyway, so it's probably not a big deal. +		 */ + +	} else if (pending) {  		/*  		 * This happens when we get migrated between migrate_enable()'s  		 * preempt_enable() and scheduling the stopper task. At that @@ -1971,43 +1975,32 @@ static int migration_cpu_stop(void *data)  		 * ->pi_lock, so the allowed mask is stable - if it got  		 * somewhere allowed, we're done.  		 */ -		if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { -			p->migration_pending = NULL; +		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { +			if (p->migration_pending == pending) +				p->migration_pending = NULL;  			complete = true;  			goto out;  		}  		/* -		 * When this was migrate_enable() but we no longer have an -		 * @pending, a concurrent SCA 'fixed' things and we should be -		 * valid again. Nothing to do. -		 */ -		if (!pending) { -			WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); -			goto out; -		} - -		/*  		 * When migrate_enable() hits a rq mis-match we can't reliably  		 * determine is_migration_disabled() and so have to chase after  		 * it.  		 */ +		WARN_ON_ONCE(!pending->stop_pending);  		task_rq_unlock(rq, p, &rf);  		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,  				    &pending->arg, &pending->stop_work);  		return 0;  	}  out: +	if (pending) +		pending->stop_pending = false;  	task_rq_unlock(rq, p, &rf);  	if (complete)  		complete_all(&pending->done); -	/* For pending->{arg,stop_work} */ -	pending = arg->pending; -	if (pending && refcount_dec_and_test(&pending->refs)) -		wake_up_var(&pending->refs); -  	return 0;  } @@ -2194,11 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  			    int dest_cpu, unsigned int flags)  {  	struct set_affinity_pending my_pending = { }, *pending = NULL; -	struct migration_arg arg = { -		.task = p, -		.dest_cpu = dest_cpu, -	}; -	bool complete = false; +	bool stop_pending, complete = false;  	/* Can the task run on the task's current CPU? If so, we're done */  	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { @@ -2210,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  			push_task = get_task_struct(p);  		} +		/* +		 * If there are pending waiters, but no pending stop_work, +		 * then complete now. 
+		 */  		pending = p->migration_pending; -		if (pending) { -			refcount_inc(&pending->refs); +		if (pending && !pending->stop_pending) {  			p->migration_pending = NULL;  			complete = true;  		} +  		task_rq_unlock(rq, p, rf);  		if (push_task) { @@ -2224,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  		}  		if (complete) -			goto do_complete; +			complete_all(&pending->done);  		return 0;  	} @@ -2235,6 +2228,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  			/* Install the request */  			refcount_set(&my_pending.refs, 1);  			init_completion(&my_pending.done); +			my_pending.arg = (struct migration_arg) { +				.task = p, +				.dest_cpu = -1,		/* any */ +				.pending = &my_pending, +			}; +  			p->migration_pending = &my_pending;  		} else {  			pending = p->migration_pending; @@ -2259,45 +2258,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  		return -EINVAL;  	} -	if (flags & SCA_MIGRATE_ENABLE) { - -		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ -		p->migration_flags &= ~MDF_PUSH; -		task_rq_unlock(rq, p, rf); - -		pending->arg = (struct migration_arg) { -			.task = p, -			.dest_cpu = -1, -			.pending = pending, -		}; - -		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, -				    &pending->arg, &pending->stop_work); - -		return 0; -	} -  	if (task_running(rq, p) || p->state == TASK_WAKING) {  		/* -		 * Lessen races (and headaches) by delegating -		 * is_migration_disabled(p) checks to the stopper, which will -		 * run on the same CPU as said p. +		 * MIGRATE_ENABLE gets here because 'p == current', but for +		 * anything else we cannot do is_migration_disabled(), punt +		 * and have the stopper function handle it all race-free.  		 
*/ +		stop_pending = pending->stop_pending; +		if (!stop_pending) +			pending->stop_pending = true; + +		if (flags & SCA_MIGRATE_ENABLE) +			p->migration_flags &= ~MDF_PUSH; +  		task_rq_unlock(rq, p, rf); -		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); +		if (!stop_pending) { +			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, +					    &pending->arg, &pending->stop_work); +		} + +		if (flags & SCA_MIGRATE_ENABLE) +			return 0;  	} else {  		if (!is_migration_disabled(p)) {  			if (task_on_rq_queued(p))  				rq = move_queued_task(rq, rf, p, dest_cpu); -			p->migration_pending = NULL; -			complete = true; +			if (!pending->stop_pending) { +				p->migration_pending = NULL; +				complete = true; +			}  		}  		task_rq_unlock(rq, p, rf); -do_complete:  		if (complete)  			complete_all(&pending->done);  	} @@ -2305,7 +2300,7 @@ do_complete:  	wait_for_completion(&pending->done);  	if (refcount_dec_and_test(&pending->refs)) -		wake_up_var(&pending->refs); +		wake_up_var(&pending->refs); /* No UaF, just an address */  	/*  	 * Block the original owner of &pending until all subsequent callers @@ -2313,6 +2308,9 @@ do_complete:  	 */  	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); +	/* ARGH */ +	WARN_ON_ONCE(my_pending.stop_pending); +  	return 0;  } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index acdae625c636..b5add64d9698 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)  	}  	rcu_read_unlock(); -	preempt_disable(); -	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); -	preempt_enable(); +	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);  	free_cpumask_var(tmpmask);  	cpus_read_unlock(); diff --git a/kernel/signal.c b/kernel/signal.c index ba4d1ef39a9e..f2718350bf4b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)  		return true;  	/* Only allow kernel generated signals to this kthread */ -	if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) && +	if (unlikely((t->flags & PF_KTHREAD) &&  		     (handler == SIG_KTHREAD_KERNEL) && !force))  		return true; @@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc  	/*  	 * Skip useless siginfo allocation for SIGKILL and kernel threads.  	 */ -	if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER))) +	if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))  		goto out_set;  	/* @@ -2768,13 +2768,21 @@ relock:  		}  		/* +		 * PF_IO_WORKER threads will catch and exit on fatal signals +		 * themselves. They have cleanup that must be performed, so +		 * we cannot call do_exit() on their behalf. +		 */ +		if (current->flags & PF_IO_WORKER) +			goto out; + +		/*  		 * Death signals, no core dump.  		 
*/  		do_group_exit(ksig->info.si_signo);  		/* NOTREACHED */  	}  	spin_unlock_irq(&sighand->siglock); - +out:  	ksig->sig = signr;  	if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) diff --git a/kernel/static_call.c b/kernel/static_call.c index 6906c6ec4c97..2c5950b0b90e 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)  	return (void *)((long)site->addr + (long)&site->addr);  } +static inline unsigned long __static_call_key(const struct static_call_site *site) +{ +	return (long)site->key + (long)&site->key; +}  static inline struct static_call_key *static_call_key(const struct static_call_site *site)  { -	return (struct static_call_key *) -		(((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS); +	return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);  }  /* These assume the key is word-aligned. */  static inline bool static_call_is_init(struct static_call_site *site)  { -	return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT; +	return __static_call_key(site) & STATIC_CALL_SITE_INIT;  }  static inline bool static_call_is_tail(struct static_call_site *site)  { -	return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL; +	return __static_call_key(site) & STATIC_CALL_SITE_TAIL;  }  static inline void static_call_set_init(struct static_call_site *site)  { -	site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) - +	site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -  		    (long)&site->key;  } @@ -146,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)  	};  	for (site_mod = &first; site_mod; site_mod = site_mod->next) { +		bool init = system_state < SYSTEM_RUNNING;  		struct module *mod = site_mod->mod;  		if (!site_mod->sites) { @@ -165,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)  		if (mod) {  			stop = mod->static_call_sites +  			       mod->num_static_call_sites; +			init = mod->state == MODULE_STATE_COMING;  		}  #endif @@ -172,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)  		     site < stop && static_call_key(site) == key; site++) {  			void *site_addr = static_call_addr(site); -			if (static_call_is_init(site)) { -				/* -				 * Don't write to call sites which were in -				 * initmem and have since been freed. -				 */ -				if (!mod && system_state >= SYSTEM_RUNNING) -					continue; -				if (mod && !within_module_init((unsigned long)site_addr, mod)) -					continue; -			} +			if (!init && static_call_is_init(site)) +				continue;  			if (!kernel_text_address((unsigned long)site_addr)) { -				WARN_ONCE(1, "can't patch static call site at %pS", +				/* +				 * This skips patching built-in __exit, which +				 * is part of init_section_contains() but is +				 * not part of kernel_text_address(). +				 * +				 * Skipping built-in __exit is fine since it +				 * will never be executed. 
+				 */ +				WARN_ONCE(!static_call_is_init(site), +					  "can't patch static call site at %pS",  					  site_addr);  				continue;  			}  			arch_static_call_transform(site_addr, NULL, func, -				static_call_is_tail(site)); +						   static_call_is_tail(site));  		}  	} @@ -349,7 +355,8 @@ static int static_call_add_module(struct module *mod)  	struct static_call_site *site;  	for (site = start; site != stop; site++) { -		unsigned long addr = (unsigned long)static_call_key(site); +		unsigned long s_key = __static_call_key(site); +		unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;  		unsigned long key;  		/* @@ -373,8 +380,8 @@ static int static_call_add_module(struct module *mod)  			return -EINVAL;  		} -		site->key = (key - (long)&site->key) | -			    (site->key & STATIC_CALL_SITE_FLAGS); +		key |= s_key & STATIC_CALL_SITE_FLAGS; +		site->key = key - (long)&site->key;  	}  	return __static_call_init(mod, start, stop); diff --git a/kernel/sys.c b/kernel/sys.c index b09fe21e88ff..2e2e3f378d97 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2079,7 +2079,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,  	 * up to the caller to provide sane values here, otherwise userspace  	 * tools which use this vector might be unhappy.  	 */ -	unsigned long user_auxv[AT_VECTOR_SIZE]; +	unsigned long user_auxv[AT_VECTOR_SIZE] = {};  	if (len > sizeof(user_auxv))  		return -EINVAL; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 98d7a15e8cf6..4d94e2b5499d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,  	if (flags == TIMER_ABSTIME)  		return -ERESTARTNOHAND; -	restart->fn = alarm_timer_nsleep_restart;  	restart->nanosleep.clockid = type;  	restart->nanosleep.expires = exp; +	set_restart_fn(restart, alarm_timer_nsleep_restart);  	return ret;  } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 743c852e10f2..5c9d968187ae 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,  }  /* - * Recomputes cpu_base::*next_timer and returns the earliest expires_next but - * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work.   *   * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,   * those timers will get run whenever the softirq gets handled, at the end of @@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_  	return expires_next;  } +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ +	ktime_t expires_next, soft = KTIME_MAX; + +	/* +	 * If the soft interrupt has already been activated, ignore the +	 * soft bases. They will be handled in the already raised soft +	 * interrupt. +	 */ +	if (!cpu_base->softirq_activated) { +		soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); +		/* +		 * Update the soft expiry time. clock_settime() might have +		 * affected it. 
+		 */ +		cpu_base->softirq_expires_next = soft; +	} + +	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); +	/* +	 * If a softirq timer is expiring first, update cpu_base->next_timer +	 * and program the hardware with the soft expiry time. +	 */ +	if (expires_next > soft) { +		cpu_base->next_timer = cpu_base->softirq_next_timer; +		expires_next = soft; +	} + +	return expires_next; +} +  static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)  {  	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; @@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)  {  	ktime_t expires_next; -	/* -	 * Find the current next expiration time. -	 */ -	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - -	if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { -		/* -		 * When the softirq is activated, hrtimer has to be -		 * programmed with the first hard hrtimer because soft -		 * timer interrupt could occur too late. -		 */ -		if (cpu_base->softirq_activated) -			expires_next = __hrtimer_get_next_event(cpu_base, -								HRTIMER_ACTIVE_HARD); -		else -			cpu_base->softirq_expires_next = expires_next; -	} +	expires_next = hrtimer_update_next_event(cpu_base);  	if (skip_equal && expires_next == cpu_base->expires_next)  		return; @@ -1644,8 +1662,8 @@ retry:  	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); -	/* Reevaluate the clock bases for the next expiry */ -	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); +	/* Reevaluate the clock bases for the [soft] next expiry */ +	expires_next = hrtimer_update_next_event(cpu_base);  	/*  	 * Store the new expiry value so the migration code can verify  	 * against it. 
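The hrtimer hunks above route both hrtimer_force_reprogram() and hrtimer_interrupt() through the new hrtimer_update_next_event() helper. As a rough, compilable illustration of the decision that helper encodes (this is not kernel code; the values are invented for the example): the soft expiry is only considered while its softirq has not already been raised, and the hardware is then programmed with the earlier of the soft and hard expiries.

#include <stdio.h>
#include <stdint.h>

#define KTIME_MAX INT64_MAX
typedef int64_t ktime_t;

/* Simplified userspace model of the next-event selection. */
static ktime_t next_event(ktime_t hard, ktime_t soft, int softirq_activated)
{
	/* Soft bases are ignored once their softirq is already pending. */
	if (softirq_activated)
		soft = KTIME_MAX;

	/* Program the hardware with whichever expiry comes first. */
	return soft < hard ? soft : hard;
}

int main(void)
{
	printf("%lld\n", (long long)next_event(2000, 1500, 0)); /* 1500: soft expires first */
	printf("%lld\n", (long long)next_event(2000, 1500, 1)); /* 2000: softirq already raised */
	return 0;
}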
@@ -1939,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,  	}  	restart = &current->restart_block; -	restart->fn = hrtimer_nanosleep_restart;  	restart->nanosleep.clockid = t.timer.base->clockid;  	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); +	set_restart_fn(restart, hrtimer_nanosleep_restart);  out:  	destroy_hrtimer_on_stack(&t.timer);  	return ret; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a71758e34e45..9abe15255bc4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,  		if (flags & TIMER_ABSTIME)  			return -ERESTARTNOHAND; -		restart_block->fn = posix_cpu_nsleep_restart;  		restart_block->nanosleep.clockid = which_clock; +		set_restart_fn(restart_block, posix_cpu_nsleep_restart);  	}  	return error;  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d8e35575549..3ba52d4e1314 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3231,7 +3231,8 @@ ftrace_allocate_pages(unsigned long num_to_init)  	pg = start_pg;  	while (pg) {  		order = get_count_order(pg->size / ENTRIES_PER_PAGE); -		free_pages((unsigned long)pg->records, order); +		if (order >= 0) +			free_pages((unsigned long)pg->records, order);  		start_pg = pg->next;  		kfree(pg);  		pg = start_pg; @@ -5045,6 +5046,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)  	return NULL;  } +static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) +{ +	struct ftrace_direct_func *direct; + +	direct = kmalloc(sizeof(*direct), GFP_KERNEL); +	if (!direct) +		return NULL; +	direct->addr = addr; +	direct->count = 0; +	list_add_rcu(&direct->next, &ftrace_direct_funcs); +	ftrace_direct_func_count++; +	return direct; +} +  /**   * register_ftrace_direct - Call a custom trampoline directly   * @ip: The address of the nop at the beginning of a function @@ -5120,15 +5135,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)  	direct = ftrace_find_direct_func(addr);  	if (!direct) { -		direct = kmalloc(sizeof(*direct), GFP_KERNEL); +		direct = ftrace_alloc_direct_func(addr);  		if (!direct) {  			kfree(entry);  			goto out_unlock;  		} -		direct->addr = addr; -		direct->count = 0; -		list_add_rcu(&direct->next, &ftrace_direct_funcs); -		ftrace_direct_func_count++;  	}  	entry->ip = ip; @@ -5329,6 +5340,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,  int modify_ftrace_direct(unsigned long ip,  			 unsigned long old_addr, unsigned long new_addr)  { +	struct ftrace_direct_func *direct, *new_direct = NULL;  	struct ftrace_func_entry *entry;  	struct dyn_ftrace *rec;  	int ret = -ENODEV; @@ -5344,6 +5356,20 @@ int modify_ftrace_direct(unsigned long ip,  	if (entry->direct != old_addr)  		goto out_unlock; +	direct = ftrace_find_direct_func(old_addr); +	if (WARN_ON(!direct)) +		goto out_unlock; +	if (direct->count > 1) { +		ret = -ENOMEM; +		new_direct = ftrace_alloc_direct_func(new_addr); +		if (!new_direct) +			goto out_unlock; +		direct->count--; +		new_direct->count++; +	} else { +		direct->addr = new_addr; +	} +  	/*  	 * If there's no other ftrace callback on the rec->ip location,  	 * then it can be changed directly by the architecture.
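The ftrace hunks above pull the ftrace_direct_func allocation into ftrace_alloc_direct_func() and teach modify_ftrace_direct() to keep the per-trampoline reference count straight when the old trampoline is still in use elsewhere. For orientation, here is a sketch of how a module typically drives this API; my_tramp1/my_tramp2 stand in for architecture-specific assembly trampolines and are not defined here, and wake_up_process is merely an example target, so this is an illustrative skeleton rather than a buildable sample.

#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/sched.h>

/* Assumed to be provided elsewhere as arch-specific assembly stubs. */
extern void my_tramp1(void);
extern void my_tramp2(void);

static unsigned long traced_ip;

static int __init direct_example_init(void)
{
	traced_ip = (unsigned long)wake_up_process;

	/* Attach a direct trampoline to the function's ftrace nop. */
	return register_ftrace_direct(traced_ip, (unsigned long)my_tramp1);
}

static void __exit direct_example_exit(void)
{
	/*
	 * Re-point the call site, then detach; both paths exercise the
	 * reference counting reworked in the hunks above.
	 */
	modify_ftrace_direct(traced_ip, (unsigned long)my_tramp1,
			     (unsigned long)my_tramp2);
	unregister_ftrace_direct(traced_ip, (unsigned long)my_tramp2);
}

module_init(direct_example_init);
module_exit(direct_example_exit);
MODULE_LICENSE("GPL");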
@@ -5357,6 +5383,14 @@ int modify_ftrace_direct(unsigned long ip,  		ret = 0;  	} +	if (unlikely(ret && new_direct)) { +		direct->count++; +		list_del_rcu(&new_direct->next); +		synchronize_rcu_tasks(); +		kfree(new_direct); +		ftrace_direct_func_count--; +	} +   out_unlock:  	mutex_unlock(&ftrace_lock);  	mutex_unlock(&direct_mutex); @@ -6418,7 +6452,8 @@ void ftrace_release_mod(struct module *mod)  		clear_mod_from_hashes(pg);  		order = get_count_order(pg->size / ENTRIES_PER_PAGE); -		free_pages((unsigned long)pg->records, order); +		if (order >= 0) +			free_pages((unsigned long)pg->records, order);  		tmp_page = pg->next;  		kfree(pg);  		ftrace_number_of_pages -= 1 << order; @@ -6778,7 +6813,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)  		if (!pg->index) {  			*last_pg = pg->next;  			order = get_count_order(pg->size / ENTRIES_PER_PAGE); -			free_pages((unsigned long)pg->records, order); +			if (order >= 0) +				free_pages((unsigned long)pg->records, order);  			ftrace_number_of_pages -= 1 << order;  			ftrace_number_of_groups--;  			kfree(pg); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eccb4e1187cc..5c777627212f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2984,7 +2984,8 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,  	size = nr_entries * sizeof(unsigned long);  	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, -					    sizeof(*entry) + size, trace_ctx); +				    (sizeof(*entry) - sizeof(entry->caller)) + size, +				    trace_ctx);  	if (!event)  		goto out;  	entry = ring_buffer_event_data(event); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 0b35212ffc3d..bb7bb3b478ab 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)  	struct umd_info *umd_info = info->data;  	/* cleanup if umh_setup() was successful but exec failed */ -	if (info->retval) { -		fput(umd_info->pipe_to_umh); -		fput(umd_info->pipe_from_umh); -		put_pid(umd_info->tgid); -		umd_info->tgid = NULL; -	} +	if (info->retval) +		umd_cleanup_helper(umd_info); +} + +/** + * umd_cleanup_helper - release the resources which were allocated in umd_setup + * @info: information about usermode driver + */ +void umd_cleanup_helper(struct umd_info *info) +{ +	fput(info->pipe_to_umh); +	fput(info->pipe_from_umh); +	put_pid(info->tgid); +	info->tgid = NULL;  } +EXPORT_SYMBOL_GPL(umd_cleanup_helper);  /**   * fork_usermode_driver - fork a usermode driver diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 71109065bd8e..107bc38b1945 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void)  	 * update as well, the only side effect might be a cycle delay for  	 * the softlockup check.  	 
*/ -	for_each_cpu(cpu, &watchdog_allowed_mask) +	for_each_cpu(cpu, &watchdog_allowed_mask) {  		per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; -	wq_watchdog_touch(-1); +		wq_watchdog_touch(cpu); +	}  }  void touch_softlockup_watchdog_sync(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d150da252e8..79f2319543ce 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	 */  	lockdep_assert_irqs_disabled(); -	debug_work_activate(work);  	/* if draining, only works from the same workqueue are allowed */  	if (unlikely(wq->flags & __WQ_DRAINING) && @@ -1494,6 +1493,7 @@ retry:  		worklist = &pwq->delayed_works;  	} +	debug_work_activate(work);  	insert_work(pwq, work, worklist, work_flags);  out: @@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)  			continue;  		/* get the latest of pool and touched timestamps */ +		if (pool->cpu >= 0) +			touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); +		else +			touched = READ_ONCE(wq_watchdog_touched);  		pool_ts = READ_ONCE(pool->watchdog_ts); -		touched = READ_ONCE(wq_watchdog_touched);  		if (time_after(pool_ts, touched))  			ts = pool_ts;  		else  			ts = touched; -		if (pool->cpu >= 0) { -			unsigned long cpu_touched = -				READ_ONCE(per_cpu(wq_watchdog_touched_cpu, -						  pool->cpu)); -			if (time_after(cpu_touched, ts)) -				ts = cpu_touched; -		} -  		/* did we stall? */  		if (time_after(jiffies, ts + thresh)) {  			lockup_detected = true; @@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu)  {  	if (cpu >= 0)  		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; -	else -		wq_watchdog_touched = jiffies; + +	wq_watchdog_touched = jiffies;  }  static void wq_watchdog_set_thresh(unsigned long thresh) |
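The watchdog and workqueue hunks above make touch_all_softlockup_watchdogs() refresh every CPU's workqueue watchdog stamp and make wq_watchdog_touch() always refresh the global stamp, so a bound pool is judged against its own CPU's timestamp and an unbound pool against the global one. A small compilable model of the resulting stall check, with invented numbers standing in for jiffies values:

#include <stdbool.h>
#include <stdio.h>

/* time_after(a, b): true if a is later than b, mirroring the kernel macro. */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

/*
 * Model of the reworked wq_watchdog_timer_fn() decision: a bound pool
 * (cpu >= 0) is compared against its own CPU's touch stamp, an unbound
 * pool against the global stamp, and the newer of that stamp and the
 * pool's own progress stamp must still be within the threshold.
 */
static bool pool_stalled(int cpu, unsigned long pool_ts,
			 unsigned long touched_cpu, unsigned long touched_global,
			 unsigned long jiffies, unsigned long thresh)
{
	unsigned long touched = (cpu >= 0) ? touched_cpu : touched_global;
	unsigned long ts = time_after(pool_ts, touched) ? pool_ts : touched;

	return time_after(jiffies, ts + thresh);
}

int main(void)
{
	/* Bound pool idle for 40 ticks with a 30-tick threshold: stalled. */
	printf("%d\n", pool_stalled(0, 100, 100, 140, 140, 30));
	/* Same pool, but its CPU was touched recently: not stalled. */
	printf("%d\n", pool_stalled(0, 100, 135, 100, 140, 30));
	return 0;
}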