Diffstat (limited to 'kernel')
45 files changed, 510 insertions, 232 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	array->map.key_size = attr->key_size;  	array->map.value_size = attr->value_size;  	array->map.max_entries = attr->max_entries; +	array->map.map_flags = attr->map_flags;  	array->elem_size = elem_size;  	if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)  	trie->map.key_size = attr->key_size;  	trie->map.value_size = attr->value_size;  	trie->map.max_entries = attr->max_entries; +	trie->map.map_flags = attr->map_flags;  	trie->data_size = attr->key_size -  			  offsetof(struct bpf_lpm_trie_key, data);  	trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)  	smap->map.key_size = attr->key_size;  	smap->map.value_size = value_size;  	smap->map.max_entries = attr->max_entries; +	smap->map.map_flags = attr->map_flags;  	smap->n_buckets = n_buckets;  	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..265a0d854e33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)  EXPORT_SYMBOL_GPL(bpf_prog_get_type);  /* last field in 'union bpf_attr' used by this command */ -#define	BPF_PROG_LOAD_LAST_FIELD kern_version +#define	BPF_PROG_LOAD_LAST_FIELD prog_flags  static int bpf_prog_load(union bpf_attr *attr)  { @@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)  	if (CHECK_ATTR(BPF_PROG_LOAD))  		return -EINVAL; +	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) +		return -EINVAL; +  	/* copy eBPF program license from user space */  	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),  			      sizeof(license) - 1) < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5b56c92f8e2..a8a725697bed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {  	struct bpf_verifier_stack_elem *next;  }; -#define BPF_COMPLEXITY_LIMIT_INSNS	65536 +#define BPF_COMPLEXITY_LIMIT_INSNS	98304  #define BPF_COMPLEXITY_LIMIT_STACK	1024  #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) @@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state)  		if (reg->max_value != BPF_REGISTER_MAX_RANGE)  			verbose(",max_value=%llu",  				(unsigned long long)reg->max_value); +		if (reg->min_align) +			verbose(",min_align=%u", reg->min_align); +		if (reg->aux_off) +			verbose(",aux_off=%u", reg->aux_off); +		if (reg->aux_off_align) +			verbose(",aux_off_align=%u", reg->aux_off_align);  	}  	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {  		if (state->stack_slot_type[i] == STACK_SPILL) @@ -457,16 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = {  	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5  }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ +	BUG_ON(regno >= MAX_BPF_REG); + +	memset(&regs[regno], 0, 
sizeof(regs[regno])); +	regs[regno].type = NOT_INIT; +	regs[regno].min_value = BPF_REGISTER_MIN_RANGE; +	regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} +  static void init_reg_state(struct bpf_reg_state *regs)  {  	int i; -	for (i = 0; i < MAX_BPF_REG; i++) { -		regs[i].type = NOT_INIT; -		regs[i].imm = 0; -		regs[i].min_value = BPF_REGISTER_MIN_RANGE; -		regs[i].max_value = BPF_REGISTER_MAX_RANGE; -	} +	for (i = 0; i < MAX_BPF_REG; i++) +		mark_reg_not_init(regs, i);  	/* frame pointer */  	regs[BPF_REG_FP].type = FRAME_PTR; @@ -492,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)  {  	regs[regno].min_value = BPF_REGISTER_MIN_RANGE;  	regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +	regs[regno].min_align = 0;  }  static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, @@ -779,17 +792,37 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)  }  static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, -				   int off, int size) +				   int off, int size, bool strict)  { -	if (reg->id && size != 1) { -		verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); -		return -EACCES; +	int ip_align; +	int reg_off; + +	/* Byte size accesses are always allowed. */ +	if (!strict || size == 1) +		return 0; + +	reg_off = reg->off; +	if (reg->id) { +		if (reg->aux_off_align % size) { +			verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", +				reg->aux_off_align, size); +			return -EACCES; +		} +		reg_off += reg->aux_off;  	} -	/* skb->data is NET_IP_ALIGN-ed */ -	if ((NET_IP_ALIGN + reg->off + off) % size != 0) { +	/* For platforms that do not have a Kconfig enabling +	 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of +	 * NET_IP_ALIGN is universally set to '2'.  And on platforms +	 * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get +	 * to this code only in strict mode where we want to emulate +	 * the NET_IP_ALIGN==2 checking.  Therefore use an +	 * unconditional IP align value of '2'. +	 */ +	ip_align = 2; +	if ((ip_align + reg_off + off) % size != 0) {  		verbose("misaligned packet access off %d+%d+%d size %d\n", -			NET_IP_ALIGN, reg->off, off, size); +			ip_align, reg_off, off, size);  		return -EACCES;  	} @@ -797,9 +830,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,  }  static int check_val_ptr_alignment(const struct bpf_reg_state *reg, -				   int size) +				   int size, bool strict)  { -	if (size != 1) { +	if (strict && size != 1) {  		verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");  		return -EACCES;  	} @@ -807,16 +840,17 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg,  	return 0;  } -static int check_ptr_alignment(const struct bpf_reg_state *reg, +static int check_ptr_alignment(struct bpf_verifier_env *env, +			       const struct bpf_reg_state *reg,  			       int off, int size)  { +	bool strict = env->strict_alignment; +  	switch (reg->type) {  	case PTR_TO_PACKET: -		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : -		       check_pkt_ptr_alignment(reg, off, size); +		return check_pkt_ptr_alignment(reg, off, size, strict);  	case PTR_TO_MAP_VALUE_ADJ: -		return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 
0 : -		       check_val_ptr_alignment(reg, size); +		return check_val_ptr_alignment(reg, size, strict);  	default:  		if (off % size != 0) {  			verbose("misaligned access off %d size %d\n", @@ -849,7 +883,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  	if (size < 0)  		return size; -	err = check_ptr_alignment(reg, off, size); +	err = check_ptr_alignment(env, reg, off, size);  	if (err)  		return err; @@ -883,6 +917,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  							 value_regno);  			/* note that reg.[id|off|range] == 0 */  			state->regs[value_regno].type = reg_type; +			state->regs[value_regno].aux_off = 0; +			state->regs[value_regno].aux_off_align = 0;  		}  	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -953,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)  	if (err)  		return err; +	if (is_pointer_value(env, insn->src_reg)) { +		verbose("R%d leaks addr into mem\n", insn->src_reg); +		return -EACCES; +	} +  	/* check whether atomic_add can read the memory */  	err = check_mem_access(env, insn->dst_reg, insn->off,  			       BPF_SIZE(insn->code), BPF_READ, -1); @@ -1313,7 +1354,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  	struct bpf_verifier_state *state = &env->cur_state;  	const struct bpf_func_proto *fn = NULL;  	struct bpf_reg_state *regs = state->regs; -	struct bpf_reg_state *reg;  	struct bpf_call_arg_meta meta;  	bool changes_data;  	int i, err; @@ -1380,11 +1420,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  	}  	/* reset caller saved regs */ -	for (i = 0; i < CALLER_SAVED_REGS; i++) { -		reg = regs + caller_saved[i]; -		reg->type = NOT_INIT; -		reg->imm = 0; -	} +	for (i = 0; i < CALLER_SAVED_REGS; i++) +		mark_reg_not_init(regs, caller_saved[i]);  	/* update return register */  	if (fn->ret_type == RET_INTEGER) { @@ -1455,6 +1492,8 @@ add_imm:  		 */  		dst_reg->off += imm;  	} else { +		bool had_id; +  		if (src_reg->type == PTR_TO_PACKET) {  			/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */  			tmp_reg = *dst_reg;  /* save r7 state */ @@ -1488,14 +1527,23 @@ add_imm:  				src_reg->imm);  			return -EACCES;  		} + +		had_id = (dst_reg->id != 0); +  		/* dst_reg stays as pkt_ptr type and since some positive  		 * integer value was added to the pointer, increment its 'id'  		 */  		dst_reg->id = ++env->id_gen; -		/* something was added to pkt_ptr, set range and off to zero */ +		/* something was added to pkt_ptr, set range to zero */ +		dst_reg->aux_off += dst_reg->off;  		dst_reg->off = 0;  		dst_reg->range = 0; +		if (had_id) +			dst_reg->aux_off_align = min(dst_reg->aux_off_align, +						     src_reg->min_align); +		else +			dst_reg->aux_off_align = src_reg->min_align;  	}  	return 0;  } @@ -1669,6 +1717,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg)  		reg->min_value = BPF_REGISTER_MIN_RANGE;  } +static u32 calc_align(u32 imm) +{ +	if (!imm) +		return 1U << 31; +	return imm - ((imm - 1) & imm); +} +  static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  				    struct bpf_insn *insn)  { @@ -1676,8 +1731,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	s64 min_val = BPF_REGISTER_MIN_RANGE;  	u64 max_val = BPF_REGISTER_MAX_RANGE;  	u8 opcode = BPF_OP(insn->code); +	u32 dst_align, src_align;  	dst_reg = &regs[insn->dst_reg]; +	src_align = 0;  	if (BPF_SRC(insn->code) == BPF_X) {  		
check_reg_overflow(&regs[insn->src_reg]);  	min_val = regs[insn->src_reg].min_value; @@ -1693,12 +1750,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		    regs[insn->src_reg].type != UNKNOWN_VALUE) {  			min_val = BPF_REGISTER_MIN_RANGE;  			max_val = BPF_REGISTER_MAX_RANGE; +			src_align = 0; +		} else { +			src_align = regs[insn->src_reg].min_align;  		}  	} else if (insn->imm < BPF_REGISTER_MAX_RANGE &&  		   (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {  		min_val = max_val = insn->imm; +		src_align = calc_align(insn->imm);  	} +	dst_align = dst_reg->min_align; +  	/* We don't know anything about what was done to this register, mark it  	 * as unknown.  	 */ @@ -1723,18 +1786,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  			dst_reg->min_value += min_val;  		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value += max_val; +		dst_reg->min_align = min(src_align, dst_align);  		break;  	case BPF_SUB:  		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)  			dst_reg->min_value -= min_val;  		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value -= max_val; +		dst_reg->min_align = min(src_align, dst_align);  		break;  	case BPF_MUL:  		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)  			dst_reg->min_value *= min_val;  		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value *= max_val; +		dst_reg->min_align = max(src_align, dst_align);  		break;  	case BPF_AND:  		/* Disallow AND'ing of negative numbers, ain't nobody got time @@ -1746,17 +1812,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		else  			dst_reg->min_value = 0;  		dst_reg->max_value = max_val; +		dst_reg->min_align = max(src_align, dst_align);  		break;  	case BPF_LSH:  		/* Gotta have special overflow logic here, if we're shifting  		 * more than MAX_RANGE then just assume we have an invalid  		 * range.  		 */ -		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) +		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {  			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; -		else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) -			dst_reg->min_value <<= min_val; - +			dst_reg->min_align = 1; +		} else { +			if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +				dst_reg->min_value <<= min_val; +			if (!dst_reg->min_align) +				dst_reg->min_align = 1; +			dst_reg->min_align <<= min_val; +		}  		if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))  			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;  		else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) @@ -1766,11 +1838,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		/* RSH by a negative number is undefined, and the BPF_RSH is an  		 * unsigned shift, so make the appropriate casts.  		 
*/ -		if (min_val < 0 || dst_reg->min_value < 0) +		if (min_val < 0 || dst_reg->min_value < 0) {  			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; -		else +		} else {  			dst_reg->min_value =  				(u64)(dst_reg->min_value) >> min_val; +		} +		if (min_val < 0) { +			dst_reg->min_align = 1; +		} else { +			dst_reg->min_align >>= (u64) min_val; +			if (!dst_reg->min_align) +				dst_reg->min_align = 1; +		}  		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value >>= max_val;  		break; @@ -1872,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			regs[insn->dst_reg].imm = insn->imm;  			regs[insn->dst_reg].max_value = insn->imm;  			regs[insn->dst_reg].min_value = insn->imm; +			regs[insn->dst_reg].min_align = calc_align(insn->imm);  		}  	} else if (opcode > BPF_END) { @@ -2368,7 +2449,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  {  	struct bpf_reg_state *regs = env->cur_state.regs;  	u8 mode = BPF_MODE(insn->code); -	struct bpf_reg_state *reg;  	int i, err;  	if (!may_access_skb(env->prog->type)) { @@ -2401,11 +2481,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  	}  	/* reset caller saved regs to unreadable */ -	for (i = 0; i < CALLER_SAVED_REGS; i++) { -		reg = regs + caller_saved[i]; -		reg->type = NOT_INIT; -		reg->imm = 0; -	} +	for (i = 0; i < CALLER_SAVED_REGS; i++) +		mark_reg_not_init(regs, caller_saved[i]);  	/* mark destination R0 register as readable, since it contains  	 * the value fetched from the packet @@ -2564,6 +2641,7 @@ peek_stack:  				env->explored_states[t + 1] = STATE_LIST_MARK;  		} else {  			/* conditional jump with two edges */ +			env->explored_states[t] = STATE_LIST_MARK;  			ret = push_insn(t, t + 1, FALLTHROUGH, env);  			if (ret == 1)  				goto peek_stack; @@ -2615,7 +2693,8 @@ err_free:  /* the following conditions reduce the number of explored insns   * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet   */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, +				   struct bpf_reg_state *old,  				   struct bpf_reg_state *cur)  {  	if (old->id != cur->id) @@ -2658,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old,  	 * 'if (R4 > data_end)' and all further insn were already good with r=20,  	 * so they will be good with r=30 and we can prune the search.  	 */ -	if (old->off <= cur->off && +	if (!env->strict_alignment && old->off <= cur->off &&  	    old->off >= old->range && cur->off >= cur->range)  		return true; @@ -2722,8 +2801,14 @@ static bool states_equal(struct bpf_verifier_env *env,  		     rcur->type != NOT_INIT))  			continue; +		/* Don't care about the reg->id in this case. 
*/ +		if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && +		    rcur->type == PTR_TO_MAP_VALUE_OR_NULL && +		    rold->map_ptr == rcur->map_ptr) +			continue; +  		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && -		    compare_ptrs_to_packet(rold, rcur)) +		    compare_ptrs_to_packet(env, rold, rcur))  			continue;  		return false; @@ -2856,8 +2941,15 @@ static int do_check(struct bpf_verifier_env *env)  			goto process_bpf_exit;  		} -		if (log_level && do_print_state) { -			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); +		if (need_resched()) +			cond_resched(); + +		if (log_level > 1 || (log_level && do_print_state)) { +			if (log_level > 1) +				verbose("%d:", insn_idx); +			else +				verbose("\nfrom %d to %d:", +					prev_insn_idx, insn_idx);  			print_verifier_state(&env->cur_state);  			do_print_state = false;  		} @@ -3495,6 +3587,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  		log_level = 0;  	} +	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); +	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) +		env->strict_alignment = true; +  	ret = replace_map_fd_with_map_ptr(env);  	if (ret < 0)  		goto skip_full_check; @@ -3600,6 +3696,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,  	log_level = 0; +	env->strict_alignment = false; +	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) +		env->strict_alignment = true; +  	env->explored_states = kcalloc(env->prog->len,  				       sizeof(struct bpf_verifier_state_list *),  				       GFP_KERNEL); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c3c9a0e1b3c9..8d4e85eae42c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4265,6 +4265,11 @@ static void kill_css(struct cgroup_subsys_state *css)  {  	lockdep_assert_held(&cgroup_mutex); +	if (css->flags & CSS_DYING) +		return; + +	css->flags |= CSS_DYING; +  	/*  	 * This must happen before css is disassociated with its cgroup.  	 * See seq_css() for details. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f6501f4f6040..ae643412948a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -176,9 +176,9 @@ typedef enum {  } cpuset_flagbits_t;  /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs)  { -	return test_bit(CS_ONLINE, &cs->flags); +	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);  }  static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..cb5103413bd8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1658,13 +1658,13 @@ static ssize_t write_cpuhp_target(struct device *dev,  	ret = !sp->name || sp->cant_stop ? -EINVAL : 0;  	mutex_unlock(&cpuhp_state_mutex);  	if (ret) -		return ret; +		goto out;  	if (st->state < target)  		ret = do_cpu_up(dev->id, target);  	else  		ret = do_cpu_down(dev->id, target); - +out:  	unlock_device_hotplug();  	return ret ? 
ret : count;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..6c4e523dc1e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7316,6 +7316,21 @@ int perf_event_account_interrupt(struct perf_event *event)  	return __perf_event_account_interrupt(event, 1);  } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ +	/* +	 * Due to interrupt latency (AKA "skid"), we may enter the +	 * kernel before taking an overflow, even if the PMU is only +	 * counting user events. +	 * To avoid leaking information to userspace, we must always +	 * reject kernel samples when exclude_kernel is set. +	 */ +	if (event->attr.exclude_kernel && !user_mode(regs)) +		return false; + +	return true; +} +  /*   * Generic event overflow handling, sampling.   */ @@ -7337,6 +7352,12 @@ static int __perf_event_overflow(struct perf_event *event,  	ret = __perf_event_account_interrupt(event, throttle);  	/* +	 * For security, drop the skid kernel samples if necessary. +	 */ +	if (!sample_is_allowed(event, regs)) +		return ret; + +	/*  	 * XXX event_limit might not quite work as expected on inherited  	 * events  	 */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2831480c63a2..ee97196bb151 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,  	int ret = -ENOMEM, max_order = 0;  	if (!has_aux(event)) -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {  		/* diff --git a/kernel/fork.c b/kernel/fork.c index 06d759ab4c62..e53770d2bf95 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1577,6 +1577,18 @@ static __latent_entropy struct task_struct *copy_process(  	if (!p)  		goto fork_out; +	/* +	 * This _must_ happen before we call free_task(), i.e. before we jump +	 * to any of the bad_fork_* labels. This is to avoid freeing +	 * p->set_child_tid which is (ab)used as a kthread's data pointer for +	 * kernel threads (PF_KTHREAD). +	 */ +	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; +	/* +	 * Clear TID on mm_release()? +	 */ +	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; +  	ftrace_graph_init_task(p);  	rt_mutex_init_task(p); @@ -1743,11 +1755,6 @@ static __latent_entropy struct task_struct *copy_process(  		}  	} -	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; -	/* -	 * Clear TID on mm_release()? -	 */ -	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
child_tidptr : NULL;  #ifdef CONFIG_BLOCK  	p->plug = NULL;  #endif @@ -1845,11 +1852,13 @@ static __latent_entropy struct task_struct *copy_process(  	*/  	recalc_sigpending();  	if (signal_pending(current)) { -		spin_unlock(&current->sighand->siglock); -		write_unlock_irq(&tasklist_lock);  		retval = -ERESTARTNOINTR;  		goto bad_fork_cancel_cgroup;  	} +	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { +		retval = -ENOMEM; +		goto bad_fork_cancel_cgroup; +	}  	if (likely(p->pid)) {  		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1907,6 +1916,8 @@ static __latent_entropy struct task_struct *copy_process(  	return p;  bad_fork_cancel_cgroup: +	spin_unlock(&current->sighand->siglock); +	write_unlock_irq(&tasklist_lock);  	cgroup_cancel_fork(p);  bad_fork_free_pid:  	cgroup_threadgroup_change_end(current); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..c94da688ee9b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -880,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,  	if (!desc)  		return; -	__irq_do_set_handler(desc, handle, 1, NULL);  	desc->irq_common_data.handler_data = data; +	__irq_do_set_handler(desc, handle, 1, NULL);  	irq_put_desc_busunlock(desc, flags);  } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..425170d4439b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1312,8 +1312,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  			ret = __irq_set_trigger(desc,  						new->flags & IRQF_TRIGGER_MASK); -			if (ret) +			if (ret) { +				irq_release_resources(desc);  				goto out_mask; +			}  		}  		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7367e0ec6f81..adfe3b4cfe05 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -122,7 +122,7 @@ static void *alloc_insn_page(void)  	return module_alloc(PAGE_SIZE);  } -static void free_insn_page(void *page) +void __weak free_insn_page(void *page)  {  	module_memfree(page);  } @@ -595,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work)  }  /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void)  {  	mutex_lock(&kprobe_mutex); @@ -2183,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb,  				 * The vaddr this probe is installed will soon  				 * be vfreed buy not synced to disk. Hence,  				 * disarming the breakpoint isn't needed. +				 * +				 * Note, this will also move any optimized probes +				 * that are pending to be removed from their +				 * corresponding lists to the freeing_list and +				 * will not be touched by the delayed +				 * kprobe_optimizer work handler.  				 */  				kill_kprobe(p);  			} diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 045022557936..ec4565122e65 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -10,6 +10,7 @@ config LIVEPATCH  	depends on SYSFS  	depends on KALLSYMS_ALL  	depends on HAVE_LIVEPATCH +	depends on !TRIM_UNUSED_KSYMS  	help  	  Say Y here if you want to support kernel live patching.  	  
This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f8269036bf0b..52c4e907c14b 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip,  	ops = container_of(fops, struct klp_ops, fops); -	rcu_read_lock(); +	/* +	 * A variant of synchronize_sched() is used to allow patching functions +	 * where RCU is not watching, see klp_synchronize_transition(). +	 */ +	preempt_disable_notrace();  	func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,  				      stack_node); @@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,  	klp_arch_set_pc(regs, (unsigned long)func->new_func);  unlock: -	rcu_read_unlock(); +	preempt_enable_notrace();  }  /* diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index adc0cc64aa4b..b004a1fb6032 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -49,6 +49,28 @@ static void klp_transition_work_fn(struct work_struct *work)  static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);  /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + */ +static void klp_sync(struct work_struct *work) +{ +} + +/* + * We allow to patch also functions where RCU is not watching, + * e.g. before user_exit(). We can not rely on the RCU infrastructure + * to do the synchronization. Instead hard force the sched synchronization. + * + * This approach allows to use RCU functions for manipulating func_stack + * safely. + */ +static void klp_synchronize_transition(void) +{ +	schedule_on_each_cpu(klp_sync); +} + +/*   * The transition to the target patch state is complete.  Clean up the data   * structures.   */ @@ -73,7 +95,7 @@ static void klp_complete_transition(void)  		 * func->transition gets cleared, the handler may choose a  		 * removed function.  		 */ -		synchronize_rcu(); +		klp_synchronize_transition();  	}  	if (klp_transition_patch->immediate) @@ -92,7 +114,7 @@ static void klp_complete_transition(void)  	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */  	if (klp_target_state == KLP_PATCHED) -		synchronize_rcu(); +		klp_synchronize_transition();  	read_lock(&tasklist_lock);  	for_each_process_thread(g, task) { @@ -136,7 +158,11 @@ void klp_cancel_transition(void)   */  void klp_update_patch_state(struct task_struct *task)  { -	rcu_read_lock(); +	/* +	 * A variant of synchronize_sched() is used to allow patching functions +	 * where RCU is not watching, see klp_synchronize_transition(). 
+	 */ +	preempt_disable_notrace();  	/*  	 * This test_and_clear_tsk_thread_flag() call also serves as a read @@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task)  	if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))  		task->patch_state = READ_ONCE(klp_target_state); -	rcu_read_unlock(); +	preempt_enable_notrace();  }  /* @@ -539,7 +565,7 @@ void klp_reverse_transition(void)  		clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);  	/* Let any remaining calls to klp_update_patch_state() complete */ -	synchronize_rcu(); +	klp_synchronize_transition();  	klp_start_transition();  } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..28cd09e635ed 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1785,12 +1785,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,  	int ret;  	raw_spin_lock_irq(&lock->wait_lock); - -	set_current_state(TASK_INTERRUPTIBLE); -  	/* sleep on the mutex */ +	set_current_state(TASK_INTERRUPTIBLE);  	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - +	/* +	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might +	 * have to fix that up. +	 */ +	fixup_rt_mutex_waiters(lock);  	raw_spin_unlock_irq(&lock->wait_lock);  	return ret; @@ -1822,15 +1824,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,  	raw_spin_lock_irq(&lock->wait_lock);  	/* +	 * Do an unconditional try-lock, this deals with the lock stealing +	 * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() +	 * sets a NULL owner. +	 * +	 * We're not interested in the return value, because the subsequent +	 * test on rt_mutex_owner() will infer that. If the trylock succeeded, +	 * we will own the lock and it will have removed the waiter. If we +	 * failed the trylock, we're still not owner and we need to remove +	 * ourselves. +	 */ +	try_to_take_rt_mutex(lock, current, waiter); +	/*  	 * Unless we're the owner; we're still enqueued on the wait_list.  	 * So check if we became owner, if not, take us off the wait_list.  	 */  	if (rt_mutex_owner(lock) != current) {  		remove_waiter(lock, waiter); -		fixup_rt_mutex_waiters(lock);  		cleanup = true;  	} -  	/*  	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might  	 * have to fix that up. diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d1f3e9f558b8..74a5a7255b4d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	 * if reparented.  	 */  	for (;;) { -		set_current_state(TASK_UNINTERRUPTIBLE); +		set_current_state(TASK_INTERRUPTIBLE);  		if (pid_ns->nr_hashed == init_pids)  			break;  		schedule(); diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..c7209f060eeb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void)  	if (!pm_freezing)  		atomic_inc(&system_freezing_cnt); -	pm_wakeup_clear(true); +	pm_wakeup_clear();  	pr_info("Freezing user space processes ... ");  	pm_freezing = true;  	error = try_to_freeze_tasks(true); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3b1e0f3ad07f..fa46606f3356 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1425,7 +1425,7 @@ static unsigned int nr_meta_pages;   * Numbers of normal and highmem page frames allocated for hibernation image   * before suspending devices.   
*/ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem;  /*   * Memory bitmap used for marking saveable pages (during hibernation) or   * hibernation image pages (during restore) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c0248c74d6d4..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,8 +72,6 @@ static void freeze_begin(void)  static void freeze_enter(void)  { -	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); -  	spin_lock_irq(&suspend_freeze_lock);  	if (pm_wakeup_pending())  		goto out; @@ -100,27 +98,6 @@ static void freeze_enter(void)   out:  	suspend_freeze_state = FREEZE_STATE_NONE;  	spin_unlock_irq(&suspend_freeze_lock); - -	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); -} - -static void s2idle_loop(void) -{ -	do { -		freeze_enter(); - -		if (freeze_ops && freeze_ops->wake) -			freeze_ops->wake(); - -		dpm_resume_noirq(PMSG_RESUME); -		if (freeze_ops && freeze_ops->sync) -			freeze_ops->sync(); - -		if (pm_wakeup_pending()) -			break; - -		pm_wakeup_clear(false); -	} while (!dpm_suspend_noirq(PMSG_SUSPEND));  }  void freeze_wake(void) @@ -394,8 +371,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	 * all the devices are suspended.  	 */  	if (state == PM_SUSPEND_FREEZE) { -		s2idle_loop(); -		goto Platform_early_resume; +		trace_suspend_resume(TPS("machine_suspend"), state, true); +		freeze_enter(); +		trace_suspend_resume(TPS("machine_suspend"), state, false); +		goto Platform_wake;  	}  	error = disable_nonboot_cpus(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1aecf44ab07..a1db38abac5b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,7 +269,6 @@ static struct console *exclusive_console;  #define MAX_CMDLINECONSOLES 8  static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int console_cmdline_cnt;  static int preferred_console = -1;  int console_set_on_cmdline; @@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options,  	 *	See if this tty is not yet registered, and  	 *	if we have a slot free.  	 */ -	for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { +	for (i = 0, c = console_cmdline; +	     i < MAX_CMDLINECONSOLES && c->name[0]; +	     i++, c++) {  		if (strcmp(c->name, name) == 0 && c->index == idx) { -			if (brl_options) -				return 0; - -			/* -			 * Maintain an invariant that will help to find if -			 * the matching console is preferred, see -			 * register_console(): -			 * -			 * The last non-braille console is always -			 * the preferred one. -			 */ -			if (i != console_cmdline_cnt - 1) -				swap(console_cmdline[i], -				     console_cmdline[console_cmdline_cnt - 1]); - -			preferred_console = console_cmdline_cnt - 1; - +			if (!brl_options) +				preferred_console = i;  			return 0;  		}  	} @@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options,  	braille_set_options(c, brl_options);  	c->index = idx; -	console_cmdline_cnt++;  	return 0;  }  /* @@ -2477,23 +2462,12 @@ void register_console(struct console *newcon)  	}  	/* -	 * See if this console matches one we selected on the command line. 
-	 * -	 * There may be several entries in the console_cmdline array matching -	 * with the same console, one with newcon->match(), another by -	 * name/index: -	 * -	 *	pl011,mmio,0x87e024000000,115200 -- added from SPCR -	 *	ttyAMA0 -- added from command line -	 * -	 * Traverse the console_cmdline array in reverse order to be -	 * sure that if this console is preferred then it will be the first -	 * matching entry.  We use the invariant that is maintained in -	 * __add_preferred_console(). +	 *	See if this console matches one we selected on +	 *	the command line.  	 */ -	for (i = console_cmdline_cnt - 1; i >= 0; i--) { -		c = console_cmdline + i; - +	for (i = 0, c = console_cmdline; +	     i < MAX_CMDLINECONSOLES && c->name[0]; +	     i++, c++) {  		if (!newcon->match ||  		    newcon->match(newcon, c->name, c->index, c->options) != 0) {  			/* default matching */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,  } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, +		   const struct cred *ptracer_cred) +{ +	BUG_ON(!list_empty(&child->ptrace_entry)); +	list_add(&child->ptrace_entry, &new_parent->ptraced); +	child->parent = new_parent; +	child->ptracer_cred = get_cred(ptracer_cred); +} +  /*   * ptrace a task: make the debugger its new parent and   * move it to the ptrace list.   *   * Must be called with the tasklist lock write-held.   */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)  { -	BUG_ON(!list_empty(&child->ptrace_entry)); -	list_add(&child->ptrace_entry, &new_parent->ptraced); -	child->parent = new_parent;  	rcu_read_lock(); -	child->ptracer_cred = get_cred(__task_cred(new_parent)); +	__ptrace_link(child, new_parent, __task_cred(new_parent));  	rcu_read_unlock();  } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request,  		flags |= PT_SEIZED;  	task->ptrace = flags; -	__ptrace_link(task, current); +	ptrace_link(task, current);  	/* SEIZE doesn't trap tracee on attach */  	if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void)  		 */  		if (!ret && !(current->real_parent->flags & PF_EXITING)) {  			current->ptrace = PT_PTRACED; -			__ptrace_link(current, current->real_parent); +			ptrace_link(current, current->real_parent);  		}  	}  	write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 584d8a983883..dea03614263f 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -263,7 +263,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct.  Must be called from process context. + * srcu_struct.   * Returns an index that must be passed to the matching srcu_read_unlock().   */  int __srcu_read_lock(struct srcu_struct *sp) @@ -271,7 +271,7 @@ int __srcu_read_lock(struct srcu_struct *sp)  	int idx;  	idx = READ_ONCE(sp->completed) & 0x1; -	__this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); +	this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);  	smp_mb(); /* B */  /* Avoid leaking the critical section. */  	return idx;  } @@ -281,7 +281,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);   * Removes the count for the old reader from the appropriate per-CPU   * element of the srcu_struct.  
Note that this may well be a different   * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context.   */  void __srcu_read_unlock(struct srcu_struct *sp, int idx)  { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 36e1f82faed1..32798eb14853 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -97,8 +97,9 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct.  Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * srcu_struct.  Can be invoked from irq/bh handlers, but the matching + * __srcu_read_unlock() must be in the same handler instance.  Returns an + * index that must be passed to the matching srcu_read_unlock().   */  int __srcu_read_lock(struct srcu_struct *sp)  { @@ -112,7 +113,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);  /*   * Removes the count for the old reader from the appropriate element of - * the srcu_struct.  Must be called from process context. + * the srcu_struct.   */  void __srcu_read_unlock(struct srcu_struct *sp, int idx)  { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3ae8474557df..157654fa436a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -357,7 +357,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct.  Must be called from process context. + * srcu_struct.   * Returns an index that must be passed to the matching srcu_read_unlock().   */  int __srcu_read_lock(struct srcu_struct *sp) @@ -365,7 +365,7 @@ int __srcu_read_lock(struct srcu_struct *sp)  	int idx;  	idx = READ_ONCE(sp->srcu_idx) & 0x1; -	__this_cpu_inc(sp->sda->srcu_lock_count[idx]); +	this_cpu_inc(sp->sda->srcu_lock_count[idx]);  	smp_mb(); /* B */  /* Avoid leaking the critical section. */  	return idx;  } @@ -375,7 +375,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);   * Removes the count for the old reader from the appropriate per-CPU   * element of the srcu_struct.  Note that this may well be a different   * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context.   */  void __srcu_read_unlock(struct srcu_struct *sp, int idx)  { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759f4bd52cd6..326d4f88e2b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)  }  EXPORT_SYMBOL(schedule); +/* + * synchronize_rcu_tasks() makes sure that no task is stuck in preempted + * state (have scheduled out non-voluntarily) by making sure that all + * tasks have either left the run queue or have gone into user space. + * As idle tasks do not do either, they must not ever be preempted + * (schedule out non-voluntarily). + * + * schedule_idle() is similar to schedule_preempt_disable() except that it + * never enables preemption because it does not call sched_submit_work(). + */ +void __sched schedule_idle(void) +{ +	/* +	 * As this skips calling sched_submit_work(), which the idle task does +	 * regardless because that function is a nop when the task is in a +	 * TASK_RUNNING state, make sure this isn't used someplace that the +	 * current task can be in any other state. Note, idle is always in the +	 * TASK_RUNNING state. 
+	 */ +	WARN_ON_ONCE(current->state); +	do { +		__schedule(false); +	} while (need_resched()); +} +  #ifdef CONFIG_CONTEXT_TRACKING  asmlinkage __visible void __sched schedule_user(void)  { @@ -5580,7 +5605,7 @@ void idle_task_exit(void)  	BUG_ON(cpu_online(smp_processor_id()));  	if (mm != &init_mm) { -		switch_mm_irqs_off(mm, &init_mm, current); +		switch_mm(mm, &init_mm, current);  		finish_arch_post_lock_switch();  	}  	mmdrop(mm); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..076a2e31951c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,  	if (sg_policy->next_freq == next_freq)  		return; -	if (sg_policy->next_freq > next_freq) -		next_freq = (sg_policy->next_freq + next_freq) >> 1; -  	sg_policy->next_freq = next_freq;  	sg_policy->last_freq_update_time = time; @@ -245,11 +242,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  	sugov_update_commit(sg_policy, time, next_f);  } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)  {  	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	struct cpufreq_policy *policy = sg_policy->policy; -	u64 last_freq_update_time = sg_policy->last_freq_update_time;  	unsigned long util = 0, max = 1;  	unsigned int j; @@ -265,7 +261,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)  		 * enough, don't take the CPU into account as it probably is  		 * idle now (and clear iowait_boost for it).  		 */ -		delta_ns = last_freq_update_time - j_sg_cpu->last_update; +		delta_ns = time - j_sg_cpu->last_update;  		if (delta_ns > TICK_NSEC) {  			j_sg_cpu->iowait_boost = 0;  			continue; @@ -309,7 +305,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,  		if (flags & SCHED_CPUFREQ_RT_DL)  			next_f = sg_policy->policy->cpuinfo.max_freq;  		else -			next_f = sugov_next_freq_shared(sg_cpu); +			next_f = sugov_next_freq_shared(sg_cpu, time);  		sugov_update_commit(sg_policy, time, next_f);  	} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..c77e4b1d51c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3563,7 +3563,7 @@ static inline void check_schedstat_required(void)  			trace_sched_stat_runtime_enabled())  {  		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "  			     "stat_blocked and stat_runtime require the " -			     "kernel parameter schedstats=enabled or " +			     "kernel parameter schedstats=enable or "  			     "kernel.sched_schedstats=1\n");  	}  #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2a25a9ec2c6e..ef63adce0c9c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -265,7 +265,7 @@ static void do_idle(void)  	smp_mb__after_atomic();  	sched_ttwu_pending(); -	schedule_preempt_disabled(); +	schedule_idle();  	if (unlikely(klp_patch_pending(current)))  		klp_update_patch_state(current); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7808ab050599..6dda2aab731e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1467,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)  }  #endif +extern void schedule_idle(void); +  extern void sysrq_sched_debug_show(void);  extern void sched_init_granularity(void);  extern void update_max_interval(void); diff --git 
a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..45b4c1ffe14e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)  	return !tsk->ptrace;  } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, +			   bool *resched_timer)  {  	struct sigqueue *q, *first = NULL; @@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)  still_pending:  		list_del_init(&first->list);  		copy_siginfo(info, &first->info); + +		*resched_timer = +			(first->flags & SIGQUEUE_PREALLOC) && +			(info->si_code == SI_TIMER) && +			(info->si_sys_private); +  		__sigqueue_free(first);  	} else {  		/* @@ -548,12 +555,12 @@ still_pending:  }  static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, -			siginfo_t *info) +			siginfo_t *info, bool *resched_timer)  {  	int sig = next_signal(pending, mask);  	if (sig) -		collect_signal(sig, pending, info); +		collect_signal(sig, pending, info, resched_timer);  	return sig;  } @@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,   */  int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)  { +	bool resched_timer = false;  	int signr;  	/* We only dequeue private signals from ourselves, we don't let  	 * signalfd steal them  	 */ -	signr = __dequeue_signal(&tsk->pending, mask, info); +	signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);  	if (!signr) {  		signr = __dequeue_signal(&tsk->signal->shared_pending, -					 mask, info); +					 mask, info, &resched_timer);  #ifdef CONFIG_POSIX_TIMERS  		/*  		 * itimer signal ? @@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)  		current->jobctl |= JOBCTL_STOP_DEQUEUED;  	}  #ifdef CONFIG_POSIX_TIMERS -	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { +	if (resched_timer) {  		/*  		 * Release the siglock to ensure proper locking order  		 * of timer locks outside of siglocks.  Note, we leave diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..ee2f4202d82a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -387,7 +387,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)  {  	struct alarm_base *base = &alarm_bases[alarm->type]; -	start = ktime_add(start, base->gettime()); +	start = ktime_add_safe(start, base->gettime());  	alarm_start(alarm, start);  }  EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -475,7 +475,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)  		overrun++;  	} -	alarm->node.expires = ktime_add(alarm->node.expires, interval); +	alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);  	return overrun;  }  EXPORT_SYMBOL_GPL(alarm_forward); @@ -660,13 +660,21 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,  	/* start the timer */  	timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + +	/* +	 * Rate limit to the tick as a hot fix to prevent DOS. Will be +	 * mopped up later. 
+	 */ +	if (timr->it.alarm.interval < TICK_NSEC) +		timr->it.alarm.interval = TICK_NSEC; +  	exp = timespec64_to_ktime(new_setting->it_value);  	/* Convert (if necessary) to absolute time */  	if (flags != TIMER_ABSTIME) {  		ktime_t now;  		now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); -		exp = ktime_add(now, exp); +		exp = ktime_add_safe(now, exp);  	}  	alarm_start(&timr->it.alarm.alarmtimer, exp); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..d2a1e6dd0291 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk,  			 * At the hard limit, we just die.  			 * No need to calculate anything else now.  			 */ -			pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", -				tsk->comm, task_pid_nr(tsk)); +			if (print_fatal_signals) { +				pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", +					tsk->comm, task_pid_nr(tsk)); +			}  			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);  			return;  		} @@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk,  				soft += USEC_PER_SEC;  				sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;  			} -			pr_info("RT Watchdog Timeout (soft): %s[%d]\n", -				tsk->comm, task_pid_nr(tsk)); +			if (print_fatal_signals) { +				pr_info("RT Watchdog Timeout (soft): %s[%d]\n", +					tsk->comm, task_pid_nr(tsk)); +			}  			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);  		}  	} @@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk,  			 * At the hard limit, we just die.  			 * No need to calculate anything else now.  			 */ -			pr_info("RT Watchdog Timeout (hard): %s[%d]\n", -				tsk->comm, task_pid_nr(tsk)); +			if (print_fatal_signals) { +				pr_info("RT Watchdog Timeout (hard): %s[%d]\n", +					tsk->comm, task_pid_nr(tsk)); +			}  			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);  			return;  		} @@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk,  			/*  			 * At the soft limit, send a SIGXCPU every second.  			 
*/ -			pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", -				tsk->comm, task_pid_nr(tsk)); +			if (print_fatal_signals) { +				pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", +					tsk->comm, task_pid_nr(tsk)); +			}  			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);  			if (soft < hard) {  				soft++; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 987e496bb51a..b398c2ea69b2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -37,9 +37,11 @@ static int tick_broadcast_forced;  static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  #ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);  static void tick_broadcast_clear_oneshot(int cpu);  static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);  #else +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }  static inline void tick_broadcast_clear_oneshot(int cpu) { }  static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }  #endif @@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,  /**   * tick_broadcast_setup_oneshot - setup the broadcast device   */ -void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  {  	int cpu = smp_processor_id(); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f738251000fe..be0ac01f2e12 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }  /* Functions related to oneshot broadcasting */  #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) -extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);  extern void tick_broadcast_switch_to_oneshot(void);  extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);  extern int tick_broadcast_oneshot_active(void); @@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void);  bool tick_broadcast_oneshot_available(void);  extern struct cpumask *tick_get_broadcast_oneshot_mask(void);  #else /* !(BROADCAST && ONESHOT): */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }  static inline void tick_broadcast_switch_to_oneshot(void) { }  static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }  static inline int tick_broadcast_oneshot_active(void) { return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9652bc57fd09..b602c48cb841 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)  	tk->offs_boot = ktime_add(tk->offs_boot, delta);  } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function.  This can cause crashes if + * the wrong clocksource is passed to the wrong read function. 
+ * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). + */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ +	struct clocksource *clock = READ_ONCE(tkr->clock); + +	return clock->read(clock); +} +  #ifdef CONFIG_DEBUG_TIMEKEEPING  #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)  	 */  	do {  		seq = read_seqcount_begin(&tk_core.seq); -		now = tkr->read(tkr->clock); +		now = tk_clock_read(tkr);  		last = tkr->cycle_last;  		mask = tkr->mask;  		max = tkr->clock->max_cycles; @@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)  	u64 cycle_now, delta;  	/* read clocksource */ -	cycle_now = tkr->read(tkr->clock); +	cycle_now = tk_clock_read(tkr);  	/* calculate the delta since the last update_wall_time */  	delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  	++tk->cs_was_changed_seq;  	old_clock = tk->tkr_mono.clock;  	tk->tkr_mono.clock = clock; -	tk->tkr_mono.read = clock->read;  	tk->tkr_mono.mask = clock->mask; -	tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); +	tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);  	tk->tkr_raw.clock = clock; -	tk->tkr_raw.read = clock->read;  	tk->tkr_raw.mask = clock->mask;  	tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -262,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  	/* Go back from cycles -> shifted ns */  	tk->xtime_interval = interval * clock->mult;  	tk->xtime_remainder = ntpinterval - tk->xtime_interval; -	tk->raw_interval = (interval * clock->mult) >> clock->shift; +	tk->raw_interval = interval * clock->mult;  	 /* if changing clocks, convert xtime_nsec shift units */  	if (old_clock) { @@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)  		now += timekeeping_delta_to_ns(tkr,  				clocksource_delta( -					tkr->read(tkr->clock), +					tk_clock_read(tkr),  					tkr->cycle_last,  					tkr->mask));  	} while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs)  	return cycles_at_suspend;  } +static struct clocksource dummy_clock = { +	.read = dummy_clock_read, +}; +  /**   * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.   * @tk: Timekeeper to snapshot. 
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)  	struct tk_read_base *tkr = &tk->tkr_mono;  	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); -	cycles_at_suspend = tkr->read(tkr->clock); -	tkr_dummy.read = dummy_clock_read; +	cycles_at_suspend = tk_clock_read(tkr); +	tkr_dummy.clock = &dummy_clock;  	update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);  	tkr = &tk->tkr_raw;  	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); -	tkr_dummy.read = dummy_clock_read; +	tkr_dummy.clock = &dummy_clock;  	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);  } @@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)   */  static void timekeeping_forward_now(struct timekeeper *tk)  { -	struct clocksource *clock = tk->tkr_mono.clock;  	u64 cycle_now, delta;  	u64 nsec; -	cycle_now = tk->tkr_mono.read(clock); +	cycle_now = tk_clock_read(&tk->tkr_mono);  	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);  	tk->tkr_mono.cycle_last = cycle_now;  	tk->tkr_raw.cycle_last  = cycle_now; @@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)  	do {  		seq = read_seqcount_begin(&tk_core.seq); - -		now = tk->tkr_mono.read(tk->tkr_mono.clock); +		now = tk_clock_read(&tk->tkr_mono);  		systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;  		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;  		base_real = ktime_add(tk->tkr_mono.base, @@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn)  		 * Check whether the system counter value provided by the  		 * device driver is on the current timekeeping interval.  		 */ -		now = tk->tkr_mono.read(tk->tkr_mono.clock); +		now = tk_clock_read(&tk->tkr_mono);  		interval_start = tk->tkr_mono.cycle_last;  		if (!cycle_between(interval_start, cycles, now)) {  			clock_was_set_seq = tk->clock_was_set_seq; @@ -1629,7 +1649,7 @@ void timekeeping_resume(void)  	 * The less preferred source will only be tried if there is no better  	 * usable source. The rtc part is handled separately in rtc core code.  	 
*/ -	cycle_now = tk->tkr_mono.read(clock); +	cycle_now = tk_clock_read(&tk->tkr_mono);  	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&  		cycle_now > tk->tkr_mono.cycle_last) {  		u64 nsec, cyc_delta; @@ -1976,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,  				    u32 shift, unsigned int *clock_set)  {  	u64 interval = tk->cycle_interval << shift; -	u64 raw_nsecs; +	u64 snsec_per_sec;  	/* If the offset is smaller than a shifted interval, do nothing */  	if (offset < interval) @@ -1991,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,  	*clock_set |= accumulate_nsecs_to_secs(tk);  	/* Accumulate raw time */ -	raw_nsecs = (u64)tk->raw_interval << shift; -	raw_nsecs += tk->raw_time.tv_nsec; -	if (raw_nsecs >= NSEC_PER_SEC) { -		u64 raw_secs = raw_nsecs; -		raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); -		tk->raw_time.tv_sec += raw_secs; +	tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; +	tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; +	snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; +	while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { +		tk->tkr_raw.xtime_nsec -= snsec_per_sec; +		tk->raw_time.tv_sec++;  	} -	tk->raw_time.tv_nsec = raw_nsecs; +	tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; +	tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;  	/* Accumulate error between NTP and clock interval */  	tk->ntp_error += tk->ntp_tick << shift; @@ -2030,7 +2051,7 @@ void update_wall_time(void)  #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET  	offset = real_tk->cycle_interval;  #else -	offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), +	offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),  				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);  #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bd8ae8d5ae9c..193c5f5e3f79 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1662,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,  		goto out;  	if (attr == &dev_attr_act_mask) { -		if (sscanf(buf, "%llx", &value) != 1) { +		if (kstrtoull(buf, 0, &value)) {  			/* Assume it is a list of trace category names */  			ret = blk_trace_str2mask(buf);  			if (ret < 0)  				goto out;  			value = ret;  		} -	} else if (sscanf(buf, "%llu", &value) != 1) +	} else if (kstrtoull(buf, 0, &value))  		goto out;  	ret = -ENXIO; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39dca4e86a94..b308be30dfb9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4144,9 +4144,9 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,  	int i, ret = -ENODEV;  	int size; -	if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) +	if (!glob || !strlen(glob) || !strcmp(glob, "*"))  		func_g.search = NULL; -	else if (glob) { +	else {  		int not;  		func_g.type = filter_parse_regex(glob, strlen(glob), @@ -4256,6 +4256,14 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,  	return ret;  } +void clear_ftrace_function_probes(struct trace_array *tr) +{ +	struct ftrace_func_probe *probe, *n; + +	list_for_each_entry_safe(probe, n, &tr->func_probes, list) +		unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); +} +  static LIST_HEAD(ftrace_commands);  static DEFINE_MUTEX(ftrace_cmd_mutex); @@ -4329,9 +4337,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,  	command = strsep(&next, ":"); -	if 
(WARN_ON_ONCE(!tr)) -		return -EINVAL; -  	mutex_lock(&ftrace_cmd_mutex);  	list_for_each_entry(p, &ftrace_commands, list) {  		if (strcmp(p->name, command) == 0) { @@ -5055,7 +5060,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)  	}   out: -	kfree(fgd->new_hash); +	free_ftrace_hash(fgd->new_hash);  	kfree(fgd);  	return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4536c449021..091e801145c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1558,7 +1558,7 @@ static __init int init_trace_selftests(void)  	return 0;  } -early_initcall(init_trace_selftests); +core_initcall(init_trace_selftests);  #else  static inline int run_tracer_selftest(struct tracer *type)  { @@ -2568,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr,  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc)  { -	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); +	struct ring_buffer *buffer = tr->trace_buffer.buffer; + +	if (rcu_is_watching()) { +		__ftrace_trace_stack(buffer, flags, skip, pc, NULL); +		return; +	} + +	/* +	 * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), +	 * but if the above rcu_is_watching() failed, then the NMI +	 * triggered someplace critical, and rcu_irq_enter() should +	 * not be called from NMI. +	 */ +	if (unlikely(in_nmi())) +		return; + +	/* +	 * It is possible that a function is being traced in a +	 * location that RCU is not watching. A call to +	 * rcu_irq_enter() will make sure that it is, but there's +	 * a few internal rcu functions that could be traced +	 * where that wont work either. In those cases, we just +	 * do nothing. +	 */ +	if (unlikely(rcu_irq_enter_disabled())) +		return; + +	rcu_irq_enter_irqson(); +	__ftrace_trace_stack(buffer, flags, skip, pc, NULL); +	rcu_irq_exit_irqson();  }  /** @@ -6852,6 +6881,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,  	char *number;  	int ret; +	if (!tr) +		return -ENODEV; +  	/* hash funcs only work with set_ftrace_filter */  	if (!enable)  		return -EINVAL; @@ -7550,6 +7582,7 @@ static int instance_rmdir(const char *name)  	}  	tracing_set_nop(tr); +	clear_ftrace_function_probes(tr);  	event_trace_del_tracer(tr);  	ftrace_clear_pids(tr);  	ftrace_destroy_function_files(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 291a1bca5748..39fd77330aab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -980,6 +980,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,  extern int  unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,  				      struct ftrace_probe_ops *ops); +extern void clear_ftrace_function_probes(struct trace_array *tr);  int register_ftrace_command(struct ftrace_func_command *cmd);  int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -998,6 +999,10 @@ static inline __init int unregister_ftrace_command(char *cmd_name)  {  	return -EINVAL;  } +static inline void clear_ftrace_function_probes(struct trace_array *tr) +{ +} +  /*   * The ops parameter passed in is usually undefined.   * This must be a macro. 
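The logarithmic_accumulation() rework above keeps the raw interval in clocksource "shifted nanoseconds" (raw_interval is now interval * clock->mult, without the >> clock->shift) and folds whole seconds out against NSEC_PER_SEC << shift rather than calling do_div(). The following is a rough userspace-only model of that arithmetic with invented field names; it deliberately omits the tv_nsec carry-in/carry-out that the kernel code performs around the loop.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct raw_acc {
	uint64_t xtime_nsec;	/* shifted nanoseconds */
	uint64_t tv_sec;
	uint32_t shift;
};

static void accumulate_raw(struct raw_acc *r, uint64_t raw_interval, uint32_t log_shift)
{
	uint64_t snsec_per_sec = NSEC_PER_SEC << r->shift;

	/* raw_interval is already mult-scaled; scale it by 2^log_shift and fold out seconds */
	r->xtime_nsec += raw_interval << log_shift;
	while (r->xtime_nsec >= snsec_per_sec) {
		r->xtime_nsec -= snsec_per_sec;
		r->tv_sec++;
	}
}

int main(void)
{
	struct raw_acc r = { .shift = 8 };

	/* e.g. a 4 ms interval in shifted ns, accumulated 512 times (log_shift = 9) */
	accumulate_raw(&r, 4000000ULL << r.shift, 9);
	printf("%llu s + %llu ns\n",
	       (unsigned long long)r.tv_sec,
	       (unsigned long long)(r.xtime_nsec >> r.shift));	/* 2 s + 48000000 ns */
	return 0;
}

Separately, the blktrace sysfs store switches from sscanf() to kstrtoull(). The difference matters because sscanf() accepts a leading number and ignores trailing characters, so a category name such as "barrier" (which begins with characters that parse as hex digits) would never fall through to blk_trace_str2mask(). A userspace illustration of that behaviour, with strict_strtoull() as a rough stand-in for kstrtoull():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Rough stand-in for kstrtoull(): the whole string must be a number. */
static int strict_strtoull(const char *s, unsigned int base, unsigned long long *res)
{
	char *end;

	errno = 0;
	*res = strtoull(s, &end, base);
	if (errno || end == s || (*end && *end != '\n'))
		return -EINVAL;
	return 0;
}

int main(void)
{
	const char *inputs[] = { "0x10", "16 trailing junk", "barrier" };
	unsigned long long v;

	for (unsigned int i = 0; i < 3; i++) {
		int lenient = sscanf(inputs[i], "%llx", &v) == 1;	/* old behaviour */
		int strict  = strict_strtoull(inputs[i], 0, &v) == 0;	/* new behaviour */

		printf("%-18s sscanf=%d strict=%d\n", inputs[i], lenient, strict);
	}
	return 0;
}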
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a3bddbfd0874..a0910c0cdf2e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,  {  	struct ftrace_probe_ops *ops; +	if (!tr) +		return -ENODEV; +  	/* we register both traceon and traceoff to this callback */  	if (strcmp(cmd, "traceon") == 0)  		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; @@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,  {  	struct ftrace_probe_ops *ops; +	if (!tr) +		return -ENODEV; +  	ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;  	return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, @@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,  {  	struct ftrace_probe_ops *ops; +	if (!tr) +		return -ENODEV; +  	ops = &dump_probe_ops;  	/* Only dump once. */ @@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,  {  	struct ftrace_probe_ops *ops; +	if (!tr) +		return -ENODEV; +  	ops = &cpudump_probe_ops;  	/* Only dump once. */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8485f6738a87..b53c8d369163 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,20 +707,16 @@ static int create_trace_kprobe(int argc, char **argv)  		pr_info("Probe point is not specified.\n");  		return -EINVAL;  	} -	if (isdigit(argv[1][0])) { -		/* an address specified */ -		ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); -		if (ret) { -			pr_info("Failed to parse address.\n"); -			return ret; -		} -	} else { + +	/* try to parse an address. if that fails, try to read the +	 * input as a symbol. */ +	if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {  		/* a symbol specified */  		symbol = argv[1];  		/* TODO: support .init module functions */  		ret = traceprobe_split_symbol_offset(symbol, &offset);  		if (ret) { -			pr_info("Failed to parse symbol.\n"); +			pr_info("Failed to parse either an address or a symbol.\n");  			return ret;  		}  		if (offset && is_return && @@ -1535,6 +1531,11 @@ static __init int kprobe_trace_self_tests_init(void)  end:  	release_all_trace_kprobes(); +	/* +	 * Wait for the optimizer work to finish. Otherwise it might fiddle +	 * with probes in already freed __init text. +	 */ +	wait_for_kprobe_optimizer();  	if (warn)  		pr_cont("NG: Some tests are failed. Please check them.\n");  	else diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 76aa04d4c925..b4a751e8f9d6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -409,7 +409,9 @@ static const struct file_operations stack_trace_fops = {  static int  stack_trace_filter_open(struct inode *inode, struct file *file)  { -	return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, +	struct ftrace_ops *ops = inode->i_private; + +	return ftrace_regex_open(ops, FTRACE_ITER_FILTER,  				 inode, file);  } @@ -476,7 +478,7 @@ static __init int stack_trace_init(void)  			NULL, &stack_trace_fops);  	trace_create_file("stack_trace_filter", 0444, d_tracer, -			NULL, &stack_trace_filter_fops); +			  &trace_ops, &stack_trace_filter_fops);  	if (stack_trace_filter_buf[0])  		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); |
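The create_trace_kprobe() change above now attempts a numeric conversion first and only falls back to symbol[+offset] parsing when that fails, instead of keying off isdigit() on the first character. Below is a userspace sketch of that parse order, with strtoul() standing in for kstrtoul() and parse_probe_point()/parse_addr() as made-up helpers, not the kernel functions.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns 0 and fills *addr when the whole token is a number, -EINVAL otherwise. */
static int parse_addr(const char *tok, unsigned long *addr)
{
	char *end;

	errno = 0;
	*addr = strtoul(tok, &end, 0);
	return (errno || end == tok || *end) ? -EINVAL : 0;
}

static int parse_probe_point(const char *tok)
{
	unsigned long addr, offset = 0;
	char sym[64];

	/* Try an address first ... */
	if (!parse_addr(tok, &addr)) {
		printf("address probe at 0x%lx\n", addr);
		return 0;
	}

	/* ... and only then treat the token as SYMBOL[+offset]. */
	if (sscanf(tok, "%63[^+]+%lu", sym, &offset) < 1)
		return -EINVAL;
	printf("symbol probe at %s+%lu\n", sym, offset);
	return 0;
}

int main(void)
{
	parse_probe_point("0xffffffff81000000");
	parse_probe_point("do_sys_open+16");
	return 0;
}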