Diffstat (limited to 'kernel')
174 files changed, 1851 insertions, 1058 deletions
| diff --git a/kernel/Makefile b/kernel/Makefile index ed470aac53da..172d151d429c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Makefile for the linux kernel.  # diff --git a/kernel/acct.c b/kernel/acct.c index 5e72af29ab73..6670fbd3e466 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  linux/kernel/acct.c   * diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 011d46e5f73f..d4b050d9a66e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "audit.h"  #include <linux/fsnotify_backend.h>  #include <linux/namei.h> diff --git a/kernel/bounds.c b/kernel/bounds.c index e1d1d1952bfa..c373e887c066 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Generate definitions needed by the preprocessor.   * This code generates raw asm output which is post-processed diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..af3ab6164ff5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y := core.o  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..e2636737b69b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();  	if (array_size >= U32_MAX - PAGE_SIZE || -	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { +	    bpf_array_alloc_percpu(array)) {  		bpf_map_area_free(array);  		return ERR_PTR(-ENOMEM);  	} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn:  		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;  		struct bpf_array *array = container_of(map, struct bpf_array, map);  		struct bpf_prog *prog; -		u64 index = BPF_R3; +		u32 index = BPF_R3;  		if (unlikely(index >= array->map.max_entries))  			goto out; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e745d6a88224 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -69,14 +69,17 @@ static LIST_HEAD(dev_map_list);  static u64 dev_map_bitmap_size(const union bpf_attr *attr)  { -	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);  }  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  {  	struct bpf_dtab *dtab; +	int err = -EINVAL;  	u64 cost; -	int err; + +	if (!capable(CAP_NET_ADMIN)) +		return ERR_PTR(-EPERM);  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,9 +111,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  	if (err)  		goto free_dtab; +	err = -ENOMEM; +  	/* A per cpu bitfield with a bit per possible net device */ -	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), -					    __alignof__(unsigned long)); +	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), +						__alignof__(unsigned long), +						GFP_KERNEL | __GFP_NOWARN);  	if (!dtab->flush_needed)  		goto free_dtab; @@ -128,7 +134,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)  free_dtab:  	
free_percpu(dtab->flush_needed);  	kfree(dtab); -	return ERR_PTR(-ENOMEM); +	return ERR_PTR(err);  }  static void dev_map_free(struct bpf_map *map) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..6533f08d1238 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  		 */  		goto free_htab; -	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) -		/* make sure the size for pcpu_alloc() is reasonable */ -		goto free_htab; -  	htab->elem_size = sizeof(struct htab_elem) +  			  round_up(htab->map.key_size, 8);  	if (percpu) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ out:  	putname(pname);  	return ret;  } +EXPORT_SYMBOL_GPL(bpf_obj_get_user);  static void bpf_evict_inode(struct inode *inode)  { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..dbd7b322a86b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -39,6 +39,7 @@  #include <linux/workqueue.h>  #include <linux/list.h>  #include <net/strparser.h> +#include <net/tcp.h>  struct bpf_stab {  	struct bpf_map map; @@ -92,21 +93,45 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)  	return rcu_dereference_sk_user_data(sk);  } +/* compute the linear packet data range [data, data_end) for skb when + * sk_skb type programs are in use. + */ +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ +	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} + +enum __sk_action { +	__SK_DROP = 0, +	__SK_PASS, +	__SK_REDIRECT, +}; +  static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)  {  	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);  	int rc;  	if (unlikely(!prog)) -		return SK_DROP; +		return __SK_DROP;  	skb_orphan(skb); +	/* We need to ensure that BPF metadata for maps is also cleared +	 * when we orphan the skb so that we don't have the possibility +	 * to reference a stale map. +	 */ +	TCP_SKB_CB(skb)->bpf.map = NULL;  	skb->sk = psock->sock; -	bpf_compute_data_end(skb); +	bpf_compute_data_end_sk_skb(skb); +	preempt_disable();  	rc = (*prog->bpf_func)(skb, prog->insnsi); +	preempt_enable();  	skb->sk = NULL; -	return rc; +	/* Moving return codes from UAPI namespace into internal namespace */ +	return rc == SK_PASS ? +		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : +		__SK_DROP;  }  static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) @@ -114,17 +139,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)  	struct sock *sk;  	int rc; -	/* Because we use per cpu values to feed input from sock redirect -	 * in BPF program to do_sk_redirect_map() call we need to ensure we -	 * are not preempted. RCU read lock is not sufficient in this case -	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. 
-	 */ -	preempt_disable();  	rc = smap_verdict_func(psock, skb);  	switch (rc) { -	case SK_REDIRECT: -		sk = do_sk_redirect_map(); -		preempt_enable(); +	case __SK_REDIRECT: +		sk = do_sk_redirect_map(skb);  		if (likely(sk)) {  			struct smap_psock *peer = smap_psock_sk(sk); @@ -139,10 +157,8 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)  			}  		}  	/* Fall through and free skb otherwise */ -	case SK_DROP: +	case __SK_DROP:  	default: -		if (rc != SK_REDIRECT) -			preempt_enable();  		kfree_skb(skb);  	}  } @@ -369,7 +385,7 @@ static int smap_parse_func_strparser(struct strparser *strp,  	 * any socket yet.  	 */  	skb->sk = psock->sock; -	bpf_compute_data_end(skb); +	bpf_compute_data_end_sk_skb(skb);  	rc = (*prog->bpf_func)(skb, prog->insnsi);  	skb->sk = NULL;  	rcu_read_unlock(); @@ -487,6 +503,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)  	int err = -EINVAL;  	u64 cost; +	if (!capable(CAP_NET_ADMIN)) +		return ERR_PTR(-EPERM); +  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 ||  	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) @@ -840,6 +859,12 @@ static int sock_map_update_elem(struct bpf_map *map,  		return -EINVAL;  	} +	if (skops.sk->sk_type != SOCK_STREAM || +	    skops.sk->sk_protocol != IPPROTO_TCP) { +		fput(socket->file); +		return -EOPNOTSUPP; +	} +  	err = sock_map_ctx_update_elem(&skops, map, key, flags);  	fput(socket->file);  	return err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)  static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)  { +	unsigned long flags; +  	if (do_idr_lock) -		spin_lock_bh(&map_idr_lock); +		spin_lock_irqsave(&map_idr_lock, flags);  	else  		__acquire(&map_idr_lock);  	idr_remove(&map_idr, map->id);  	if (do_idr_lock) -		spin_unlock_bh(&map_idr_lock); +		spin_unlock_irqrestore(&map_idr_lock, flags);  	else  		__release(&map_idr_lock);  } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..c48ca2a34b5e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)  {  	struct bpf_verifier_state *parent = state->parent; +	if (regno == BPF_REG_FP) +		/* We don't need to worry about FP liveness because it's read-only */ +		return; +  	while (parent) {  		/* if read wasn't screened by an earlier write ... */  		if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -1112,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		/* ctx accesses must be at a fixed offset, so that we can  		 * determine what type of data were returned.  		 
*/ -		if (!tnum_is_const(reg->var_off)) { +		if (reg->off) { +			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", +				regno, reg->off, off - reg->off); +			return -EACCES; +		} +		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {  			char tn_buf[48];  			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -1120,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  				tn_buf, off, size);  			return -EACCES;  		} -		off += reg->var_off.value;  		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);  		if (!err && t == BPF_READ && value_regno >= 0) {  			/* ctx access returns either a scalar, or a @@ -2345,6 +2353,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  				 * copy register state to dest reg  				 */  				regs[insn->dst_reg] = regs[insn->src_reg]; +				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;  			} else {  				/* R1 = (u32) R2 */  				if (is_pointer_value(env, insn->src_reg)) { @@ -2421,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  }  static void find_good_pkt_pointers(struct bpf_verifier_state *state, -				   struct bpf_reg_state *dst_reg) +				   struct bpf_reg_state *dst_reg, +				   bool range_right_open)  {  	struct bpf_reg_state *regs = state->regs, *reg; +	u16 new_range;  	int i; -	if (dst_reg->off < 0) +	if (dst_reg->off < 0 || +	    (dst_reg->off == 0 && range_right_open))  		/* This doesn't give us any range */  		return; @@ -2437,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  		 */  		return; -	/* LLVM can generate four kind of checks: +	new_range = dst_reg->off; +	if (range_right_open) +		new_range--; + +	/* Examples for register markings:  	 * -	 * Type 1/2: +	 * pkt_data in dst register:  	 *  	 *   r2 = r3;  	 *   r2 += 8; @@ -2456,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	 *     r2=pkt(id=n,off=8,r=0)  	 *     r3=pkt(id=n,off=0,r=0)  	 * -	 * Type 3/4: +	 * pkt_data in src register:  	 *  	 *   r2 = r3;  	 *   r2 += 8; @@ -2474,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	 *     r3=pkt(id=n,off=0,r=0)  	 *  	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) -	 * so that range of bytes [r3, r3 + 8) is safe to access. +	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) +	 * and [r3, r3 + 8-1) respectively is safe to access depending on +	 * the check.  	 */  	/* If our ids match, then we must have the same max_value.  
And we @@ -2485,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,  	for (i = 0; i < MAX_BPF_REG; i++)  		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)  			/* keep the maximum range already checked */ -			regs[i].range = max_t(u16, regs[i].range, dst_reg->off); +			regs[i].range = max(regs[i].range, new_range);  	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {  		if (state->stack_slot_type[i] != STACK_SPILL)  			continue;  		reg = &state->spilled_regs[i / BPF_REG_SIZE];  		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) -			reg->range = max_t(u16, reg->range, dst_reg->off); +			reg->range = max(reg->range, new_range);  	}  } @@ -2856,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&  		   dst_reg->type == PTR_TO_PACKET &&  		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { -		find_good_pkt_pointers(this_branch, dst_reg); +		/* pkt_data' > pkt_end */ +		find_good_pkt_pointers(this_branch, dst_reg, false); +	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && +		   dst_reg->type == PTR_TO_PACKET_END && +		   regs[insn->src_reg].type == PTR_TO_PACKET) { +		/* pkt_end > pkt_data' */ +		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true); +	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && +		   dst_reg->type == PTR_TO_PACKET && +		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { +		/* pkt_data' < pkt_end */ +		find_good_pkt_pointers(other_branch, dst_reg, true);  	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && +		   dst_reg->type == PTR_TO_PACKET_END && +		   regs[insn->src_reg].type == PTR_TO_PACKET) { +		/* pkt_end < pkt_data' */ +		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false); +	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&  		   dst_reg->type == PTR_TO_PACKET &&  		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { -		find_good_pkt_pointers(other_branch, dst_reg); +		/* pkt_data' >= pkt_end */ +		find_good_pkt_pointers(this_branch, dst_reg, true);  	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&  		   dst_reg->type == PTR_TO_PACKET_END &&  		   regs[insn->src_reg].type == PTR_TO_PACKET) { -		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]); +		/* pkt_end >= pkt_data' */ +		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false); +	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && +		   dst_reg->type == PTR_TO_PACKET && +		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { +		/* pkt_data' <= pkt_end */ +		find_good_pkt_pointers(other_branch, dst_reg, false);  	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&  		   dst_reg->type == PTR_TO_PACKET_END &&  		   regs[insn->src_reg].type == PTR_TO_PACKET) { -		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]); +		/* pkt_end <= pkt_data' */ +		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);  	} else if (is_pointer_value(env, insn->dst_reg)) {  		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);  		return -EACCES; @@ -4205,7 +4247,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  		}  		if (insn->imm == BPF_FUNC_redirect_map) { -			u64 addr = (unsigned long)prog; +			/* Note, we cannot use prog directly as imm as subsequent +			 * rewrites would still change the prog pointer. The only +			 * stable address we can use is aux, which also works with +			 * prog clones during blinding. 
+			 */ +			u64 addr = (unsigned long)prog->aux;  			struct bpf_insn r4_ld[] = {  				BPF_LD_IMM64(BPF_REG_4, addr),  				*insn, diff --git a/kernel/capability.c b/kernel/capability.c index f97fe77ceb88..1e1c0236f55b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/capability.c   * diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..ae448f7632cc 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y := cgroup.o namespace.o cgroup-v1.o  obj-$(CONFIG_CGROUP_FREEZER) += freezer.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..bf54ade001be 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef __CGROUP_INTERNAL_H  #define __CGROUP_INTERNAL_H diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2311,6 +2311,14 @@ out_release_tset:  		list_del_init(&cset->mg_node);  	}  	spin_unlock_irq(&css_set_lock); + +	/* +	 * Re-initialize the cgroup_taskset structure in case it is reused +	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() +	 * iteration. +	 */ +	tset->nr_tasks = 0; +	tset->csets    = &tset->src_csets;  	return ret;  } diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index f661b4cc5efd..5f780d8f6a9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Debug controller   * diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 66129eb4371d..b05f1dd58a62 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "cgroup-internal.h"  #include <linux/sched/task.h> diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..04892a82f6ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@  #include <linux/lockdep.h>  #include <linux/tick.h>  #include <linux/irq.h> +#include <linux/nmi.h>  #include <linux/smpboot.h>  #include <linux/relay.h>  #include <linux/slab.h> @@ -46,11 +47,13 @@   * @bringup:	Single callback bringup or teardown selector   * @cb_state:	The state for a single callback (install/uninstall)   * @result:	Result of the operation - * @done:	Signal completion to the issuer of the task + * @done_up:	Signal completion to the issuer of the task for cpu-up + * @done_down:	Signal completion to the issuer of the task for cpu-down   */  struct cpuhp_cpu_state {  	enum cpuhp_state	state;  	enum cpuhp_state	target; +	enum cpuhp_state	fail;  #ifdef CONFIG_SMP  	struct task_struct	*thread;  	bool			should_run; @@ -58,18 +61,39 @@ struct cpuhp_cpu_state {  	bool			single;  	bool			bringup;  	struct hlist_node	*node; +	struct hlist_node	*last;  	enum cpuhp_state	cb_state;  	int			result; -	struct completion	done; +	struct completion	done_up; +	struct completion	done_down;  #endif  }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { +	.fail = CPUHP_INVALID, +};  #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = -	STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct 
lockdep_map cpuhp_state_up_map = +	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = +	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ +	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ +	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } +  #endif  /** @@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)  /**   * cpuhp_invoke_callback _ Invoke the callbacks for a given state   * @cpu:	The cpu for which the callback should be invoked - * @step:	The step in the state machine + * @state:	The state to do callbacks for   * @bringup:	True if the bringup callback should be invoked + * @node:	For multi-instance, do a single entry callback for install/remove + * @lastp:	For multi-instance rollback, remember how far we got   *   * Called from cpu hotplug and from the state register machinery.   */  static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, -				 bool bringup, struct hlist_node *node) +				 bool bringup, struct hlist_node *node, +				 struct hlist_node **lastp)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);  	struct cpuhp_step *step = cpuhp_get_step(state); @@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,  	int (*cb)(unsigned int cpu);  	int ret, cnt; +	if (st->fail == state) { +		st->fail = CPUHP_INVALID; + +		if (!(bringup ? step->startup.single : step->teardown.single)) +			return 0; + +		return -EAGAIN; +	} +  	if (!step->multi_instance) { +		WARN_ON_ONCE(lastp && *lastp);  		cb = bringup ? step->startup.single : step->teardown.single;  		if (!cb)  			return 0; @@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,  	/* Single invocation for instance add/remove */  	if (node) { +		WARN_ON_ONCE(lastp && *lastp);  		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);  		ret = cbm(cpu, node);  		trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,  	/* State transition. Invoke on all instances */  	cnt = 0;  	hlist_for_each(node, &step->list) { +		if (lastp && node == *lastp) +			break; +  		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);  		ret = cbm(cpu, node);  		trace_cpuhp_exit(cpu, st->state, state, ret); -		if (ret) -			goto err; +		if (ret) { +			if (!lastp) +				goto err; + +			*lastp = node; +			return ret; +		}  		cnt++;  	} +	if (lastp) +		*lastp = NULL;  	return 0;  err:  	/* Rollback the instances if one failed */ @@ -178,12 +226,39 @@ err:  	hlist_for_each(node, &step->list) {  		if (!cnt--)  			break; -		cbm(cpu, node); + +		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); +		ret = cbm(cpu, node); +		trace_cpuhp_exit(cpu, st->state, state, ret); +		/* +		 * Rollback must not fail, +		 */ +		WARN_ON_ONCE(ret);  	}  	return ret;  }  #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ +	struct completion *done = bringup ? 
&st->done_up : &st->done_down; +	wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ +	struct completion *done = bringup ? &st->done_up : &st->done_down; +	complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ +	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} +  /* Serializes the updates to cpu_online_mask, cpu_present_mask */  static DEFINE_MUTEX(cpu_add_remove_lock);  bool cpuhp_tasks_frozen; @@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)  EXPORT_SYMBOL_GPL(cpu_hotplug_enable);  #endif	/* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ +	enum cpuhp_state prev_state = st->state; + +	st->rollback = false; +	st->last = NULL; + +	st->target = target; +	st->single = false; +	st->bringup = st->state < target; + +	return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ +	st->rollback = true; + +	/* +	 * If we have st->last we need to undo partial multi_instance of this +	 * state first. Otherwise start undo at the previous state. +	 */ +	if (!st->last) { +		if (st->bringup) +			st->state--; +		else +			st->state++; +	} + +	st->target = prev_state; +	st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ +	if (!st->single && st->state == st->target) +		return; + +	st->result = 0; +	/* +	 * Make sure the above stores are visible before should_run becomes +	 * true. Paired with the mb() above in cpuhp_thread_fun() +	 */ +	smp_mb(); +	st->should_run = true; +	wake_up_process(st->thread); +	wait_for_ap_thread(st, st->bringup); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ +	enum cpuhp_state prev_state; +	int ret; + +	prev_state = cpuhp_set_state(st, target); +	__cpuhp_kick_ap(st); +	if ((ret = st->result)) { +		cpuhp_reset_state(st, prev_state); +		__cpuhp_kick_ap(st); +	} + +	return ret; +}  static int bringup_wait_for_ap(unsigned int cpu)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);  	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ -	wait_for_completion(&st->done); +	wait_for_ap_thread(st, true);  	if (WARN_ON_ONCE((!cpu_online(cpu))))  		return -ECANCELED; @@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)  	stop_machine_unpark(cpu);  	kthread_unpark(st->thread); -	/* Should we go further up ? 
*/ -	if (st->target > CPUHP_AP_ONLINE_IDLE) { -		__cpuhp_kick_ap_work(st); -		wait_for_completion(&st->done); -	} -	return st->result; +	if (st->target <= CPUHP_AP_ONLINE_IDLE) +		return 0; + +	return cpuhp_kick_ap(st, st->target);  }  static int bringup_cpu(unsigned int cpu) @@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)  /*   * Hotplug state machine related functions   */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ -	for (st->state++; st->state < st->target; st->state++) { -		struct cpuhp_step *step = cpuhp_get_step(st->state); - -		if (!step->skip_onerr) -			cpuhp_invoke_callback(cpu, st->state, true, NULL); -	} -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, -				enum cpuhp_state target) -{ -	enum cpuhp_state prev_state = st->state; -	int ret = 0; - -	for (; st->state > target; st->state--) { -		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); -		if (ret) { -			st->target = prev_state; -			undo_cpu_down(cpu, st); -			break; -		} -	} -	return ret; -}  static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)  { @@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)  		struct cpuhp_step *step = cpuhp_get_step(st->state);  		if (!step->skip_onerr) -			cpuhp_invoke_callback(cpu, st->state, false, NULL); +			cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);  	}  } @@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,  	while (st->state < target) {  		st->state++; -		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); +		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);  		if (ret) {  			st->target = prev_state;  			undo_cpu_up(cpu, st); @@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); -	init_completion(&st->done); +	init_completion(&st->done_up); +	init_completion(&st->done_down);  }  static int cpuhp_should_run(unsigned int cpu) @@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)  	return st->should_run;  } -/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ -	enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - -	return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ -	return cpuhp_up_callbacks(cpu, st, st->target); -} -  /*   * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke   * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + *  - single: runs st->cb_state + *  - up:     runs ++st->state, while st->state < st->target + *  - down:   runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired.   */  static void cpuhp_thread_fun(unsigned int cpu)  {  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); -	int ret = 0; +	bool bringup = st->bringup; +	enum cpuhp_state state;  	/* -	 * Paired with the mb() in cpuhp_kick_ap_work and -	 * cpuhp_invoke_ap_callback, so the work set is consistent visible. +	 * ACQUIRE for the cpuhp_should_run() load of ->should_run. 
Ensures +	 * that if we see ->should_run we also see the rest of the state.  	 */  	smp_mb(); -	if (!st->should_run) + +	if (WARN_ON_ONCE(!st->should_run))  		return; -	st->should_run = false; +	cpuhp_lock_acquire(bringup); -	lock_map_acquire(&cpuhp_state_lock_map); -	/* Single callback invocation for [un]install ? */  	if (st->single) { -		if (st->cb_state < CPUHP_AP_ONLINE) { -			local_irq_disable(); -			ret = cpuhp_invoke_callback(cpu, st->cb_state, -						    st->bringup, st->node); -			local_irq_enable(); +		state = st->cb_state; +		st->should_run = false; +	} else { +		if (bringup) { +			st->state++; +			state = st->state; +			st->should_run = (st->state < st->target); +			WARN_ON_ONCE(st->state > st->target);  		} else { -			ret = cpuhp_invoke_callback(cpu, st->cb_state, -						    st->bringup, st->node); +			state = st->state; +			st->state--; +			st->should_run = (st->state > st->target); +			WARN_ON_ONCE(st->state < st->target);  		} -	} else if (st->rollback) { -		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); +	} + +	WARN_ON_ONCE(!cpuhp_is_ap_state(state)); -		undo_cpu_down(cpu, st); -		st->rollback = false; +	if (st->rollback) { +		struct cpuhp_step *step = cpuhp_get_step(state); +		if (step->skip_onerr) +			goto next; +	} + +	if (cpuhp_is_atomic_state(state)) { +		local_irq_disable(); +		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); +		local_irq_enable(); + +		/* +		 * STARTING/DYING must not fail! +		 */ +		WARN_ON_ONCE(st->result);  	} else { -		/* Cannot happen .... */ -		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - -		/* Regular hotplug work */ -		if (st->state < st->target) -			ret = cpuhp_ap_online(cpu, st); -		else if (st->state > st->target) -			ret = cpuhp_ap_offline(cpu, st); +		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); +	} + +	if (st->result) { +		/* +		 * If we fail on a rollback, we're up a creek without no +		 * paddle, no way forward, no way back. We loose, thanks for +		 * playing. +		 */ +		WARN_ON_ONCE(st->rollback); +		st->should_run = false;  	} -	lock_map_release(&cpuhp_state_lock_map); -	st->result = ret; -	complete(&st->done); + +next: +	cpuhp_lock_release(bringup); + +	if (!st->should_run) +		complete_ap_thread(st, bringup);  }  /* Invoke a single callback on a remote cpu */ @@ -460,62 +594,69 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,  			 struct hlist_node *node)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); +	int ret;  	if (!cpu_online(cpu))  		return 0; -	lock_map_acquire(&cpuhp_state_lock_map); -	lock_map_release(&cpuhp_state_lock_map); +	cpuhp_lock_acquire(false); +	cpuhp_lock_release(false); + +	cpuhp_lock_acquire(true); +	cpuhp_lock_release(true);  	/*  	 * If we are up and running, use the hotplug thread. For early calls  	 * we invoke the thread function directly.  	 */  	if (!st->thread) -		return cpuhp_invoke_callback(cpu, state, bringup, node); +		return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); +	st->rollback = false; +	st->last = NULL; + +	st->node = node; +	st->bringup = bringup;  	st->cb_state = state;  	st->single = true; -	st->bringup = bringup; -	st->node = node; + +	__cpuhp_kick_ap(st);  	/* -	 * Make sure the above stores are visible before should_run becomes -	 * true. Paired with the mb() above in cpuhp_thread_fun() +	 * If we failed and did a partial, do a rollback.  	 
*/ -	smp_mb(); -	st->should_run = true; -	wake_up_process(st->thread); -	wait_for_completion(&st->done); -	return st->result; -} +	if ((ret = st->result) && st->last) { +		st->rollback = true; +		st->bringup = !bringup; + +		__cpuhp_kick_ap(st); +	} -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ -	st->result = 0; -	st->single = false;  	/* -	 * Make sure the above stores are visible before should_run becomes -	 * true. Paired with the mb() above in cpuhp_thread_fun() +	 * Clean up the leftovers so the next hotplug operation wont use stale +	 * data.  	 */ -	smp_mb(); -	st->should_run = true; -	wake_up_process(st->thread); +	st->node = st->last = NULL; +	return ret;  }  static int cpuhp_kick_ap_work(unsigned int cpu)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); -	enum cpuhp_state state = st->state; +	enum cpuhp_state prev_state = st->state; +	int ret; + +	cpuhp_lock_acquire(false); +	cpuhp_lock_release(false); + +	cpuhp_lock_acquire(true); +	cpuhp_lock_release(true); -	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); -	lock_map_acquire(&cpuhp_state_lock_map); -	lock_map_release(&cpuhp_state_lock_map); -	__cpuhp_kick_ap_work(st); -	wait_for_completion(&st->done); -	trace_cpuhp_exit(cpu, st->state, state, st->result); -	return st->result; +	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); +	ret = cpuhp_kick_ap(st, st->target); +	trace_cpuhp_exit(cpu, st->state, prev_state, ret); + +	return ret;  }  static struct smp_hotplug_thread cpuhp_threads = { @@ -581,6 +722,7 @@ static int take_cpu_down(void *_param)  	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);  	enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);  	int err, cpu = smp_processor_id(); +	int ret;  	/* Ensure this CPU doesn't handle any more interrupts. */  	err = __cpu_disable(); @@ -594,8 +736,13 @@ static int take_cpu_down(void *_param)  	WARN_ON(st->state != CPUHP_TEARDOWN_CPU);  	st->state--;  	/* Invoke the former CPU_DYING callbacks */ -	for (; st->state > target; st->state--) -		cpuhp_invoke_callback(cpu, st->state, false, NULL); +	for (; st->state > target; st->state--) { +		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); +		/* +		 * DYING must not fail! +		 */ +		WARN_ON_ONCE(ret); +	}  	/* Give up timekeeping duties */  	tick_handover_do_timer(); @@ -639,7 +786,7 @@ static int takedown_cpu(unsigned int cpu)  	 *  	 * Wait for the stop thread to go away.  	 
*/ -	wait_for_completion(&st->done); +	wait_for_ap_thread(st, false);  	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);  	/* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -658,7 +805,7 @@ static void cpuhp_complete_idle_dead(void *arg)  {  	struct cpuhp_cpu_state *st = arg; -	complete(&st->done); +	complete_ap_thread(st, false);  }  void cpuhp_report_idle_dead(void) @@ -676,11 +823,32 @@ void cpuhp_report_idle_dead(void)  				 cpuhp_complete_idle_dead, st, 0);  } -#else -#define takedown_cpu		NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ +	for (st->state++; st->state < st->target; st->state++) { +		struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU +		if (!step->skip_onerr) +			cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); +	} +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, +				enum cpuhp_state target) +{ +	enum cpuhp_state prev_state = st->state; +	int ret = 0; + +	for (; st->state > target; st->state--) { +		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); +		if (ret) { +			st->target = prev_state; +			undo_cpu_down(cpu, st); +			break; +		} +	} +	return ret; +}  /* Requires cpu_add_remove_lock to be held */  static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -699,13 +867,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,  	cpuhp_tasks_frozen = tasks_frozen; -	prev_state = st->state; -	st->target = target; +	prev_state = cpuhp_set_state(st, target);  	/*  	 * If the current CPU state is in the range of the AP hotplug thread,  	 * then we need to kick the thread.  	 */  	if (st->state > CPUHP_TEARDOWN_CPU) { +		st->target = max((int)target, CPUHP_TEARDOWN_CPU);  		ret = cpuhp_kick_ap_work(cpu);  		/*  		 * The AP side has done the error rollback already. Just @@ -720,6 +888,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,  		 */  		if (st->state > CPUHP_TEARDOWN_CPU)  			goto out; + +		st->target = target;  	}  	/*  	 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -727,13 +897,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,  	 */  	ret = cpuhp_down_callbacks(cpu, st, target);  	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { -		st->target = prev_state; -		st->rollback = true; -		cpuhp_kick_ap_work(cpu); +		cpuhp_reset_state(st, prev_state); +		__cpuhp_kick_ap(st);  	}  out:  	cpus_write_unlock(); +	/* +	 * Do post unplug cleanup. This is still protected against +	 * concurrent CPU hotplug via cpu_add_remove_lock. +	 */ +	lockup_detector_cleanup();  	return ret;  } @@ -754,11 +928,15 @@ out:  	cpu_maps_update_done();  	return err;  } +  int cpu_down(unsigned int cpu)  {  	return do_cpu_down(cpu, CPUHP_OFFLINE);  }  EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu		NULL  #endif /*CONFIG_HOTPLUG_CPU*/  /** @@ -772,11 +950,16 @@ void notify_cpu_starting(unsigned int cpu)  {  	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);  	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); +	int ret;  	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */  	while (st->state < target) {  		st->state++; -		cpuhp_invoke_callback(cpu, st->state, true, NULL); +		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); +		/* +		 * STARTING must not fail! 
+		 */ +		WARN_ON_ONCE(ret);  	}  } @@ -794,7 +977,7 @@ void cpuhp_online_idle(enum cpuhp_state state)  		return;  	st->state = CPUHP_AP_ONLINE_IDLE; -	complete(&st->done); +	complete_ap_thread(st, true);  }  /* Requires cpu_add_remove_lock to be held */ @@ -829,7 +1012,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)  	cpuhp_tasks_frozen = tasks_frozen; -	st->target = target; +	cpuhp_set_state(st, target);  	/*  	 * If the current CPU state is in the range of the AP hotplug thread,  	 * then we need to kick the thread once more. @@ -1296,6 +1479,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,  	struct cpuhp_step *sp = cpuhp_get_step(state);  	int ret; +	/* +	 * If there's nothing to do, we done. +	 * Relies on the union for multi_instance. +	 */  	if ((bringup && !sp->startup.single) ||  	    (!bringup && !sp->teardown.single))  		return 0; @@ -1307,9 +1494,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,  	if (cpuhp_is_ap_state(state))  		ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);  	else -		ret = cpuhp_invoke_callback(cpu, state, bringup, node); +		ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);  #else -	ret = cpuhp_invoke_callback(cpu, state, bringup, node); +	ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);  #endif  	BUG_ON(ret && !bringup);  	return ret; @@ -1641,9 +1828,55 @@ static ssize_t show_cpuhp_target(struct device *dev,  }  static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, +				struct device_attribute *attr, +				const char *buf, size_t count) +{ +	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); +	struct cpuhp_step *sp; +	int fail, ret; + +	ret = kstrtoint(buf, 10, &fail); +	if (ret) +		return ret; + +	/* +	 * Cannot fail STARTING/DYING callbacks. +	 */ +	if (cpuhp_is_atomic_state(fail)) +		return -EINVAL; + +	/* +	 * Cannot fail anything that doesn't have callbacks. +	 */ +	mutex_lock(&cpuhp_state_mutex); +	sp = cpuhp_get_step(fail); +	if (!sp->startup.single && !sp->teardown.single) +		ret = -EINVAL; +	mutex_unlock(&cpuhp_state_mutex); +	if (ret) +		return ret; + +	st->fail = fail; + +	return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, +			       struct device_attribute *attr, char *buf) +{ +	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + +	return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); +  static struct attribute *cpuhp_cpu_attrs[] = {  	&dev_attr_state.attr,  	&dev_attr_target.attr, +	&dev_attr_fail.attr,  	NULL  }; diff --git a/kernel/dma.c b/kernel/dma.c index 6c6262f86c17..3506fc34a712 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.   
* diff --git a/kernel/elfcore.c b/kernel/elfcore.c index e556751d15d9..fc482c8e0bd8 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/elf.h>  #include <linux/fs.h>  #include <linux/mm.h> diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 2925188f50ea..3c022e33c109 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  ifdef CONFIG_FUNCTION_TRACER  CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)  endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..10cdb9c26b5d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)  	/*  	 * Do not update time when cgroup is not active  	 */ -	if (cgrp == event->cgrp) +       if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))  		__update_cgrp_time(event->cgrp);  } @@ -901,9 +901,11 @@ list_update_cgroup_event(struct perf_event *event,  	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;  	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/  	if (add) { +		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); +  		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); -		if (perf_cgroup_from_task(current, ctx) == event->cgrp) -			cpuctx->cgrp = event->cgrp; +		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) +			cpuctx->cgrp = cgrp;  	} else {  		list_del(cpuctx_entry);  		cpuctx->cgrp = NULL; @@ -8171,6 +8173,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)  		}  	}  	event->tp_event->prog = prog; +	event->tp_event->bpf_prog_owner = event;  	return 0;  } @@ -8185,7 +8188,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)  		return;  	prog = event->tp_event->prog; -	if (prog) { +	if (prog && event->tp_event->bpf_prog_owner == event) {  		event->tp_event->prog = NULL;  		bpf_prog_put(prog);  	} @@ -8954,6 +8957,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)  static void free_pmu_context(struct pmu *pmu)  { +	/* +	 * Static contexts such as perf_sw_context have a global lifetime +	 * and may be shared between different PMUs. Avoid freeing them +	 * when a single PMU is going away. +	 */ +	if (pmu->task_ctx_nr > perf_invalid_context) +		return; +  	mutex_lock(&pmus_lock);  	free_percpu(pmu->pmu_cpu_context);  	mutex_unlock(&pmus_lock); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 843e97047335..09b1537ae06c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _KERNEL_EVENTS_INTERNAL_H  #define _KERNEL_EVENTS_INTERNAL_H diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err:  	return NULL;  } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ +	if (rb->aux_overwrite) +		return false; + +	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { +		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); +		return true; +	} + +	return false; +} +  /*   * Commit the data written by hardware into the ring buffer by adjusting   * aux_head and posting a PERF_RECORD_AUX into the perf buffer. 
It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)  	}  	rb->user_page->aux_head = rb->aux_head; -	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { +	if (rb_need_aux_wakeup(rb))  		wakeup = true; -		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); -	}  	if (wakeup) {  		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)  	rb->aux_head += size;  	rb->user_page->aux_head = rb->aux_head; -	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { +	if (rb_need_aux_wakeup(rb)) {  		perf_output_wakeup(handle); -		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);  		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;  	} diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 6873bb3e6b7e..0975b0268545 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Handling of different ABIs (personalities).   * diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f6cad39f35df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,18 +1600,19 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,  	struct waitid_info info = {.status = 0};  	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);  	int signo = 0; +  	if (err > 0) {  		signo = SIGCHLD;  		err = 0; -	} - -	if (!err) {  		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))  			return -EFAULT;  	}  	if (!infop)  		return err; +	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) +		return -EFAULT; +  	user_access_begin();  	unsafe_put_user(signo, &infop->si_signo, Efault);  	unsafe_put_user(0, &infop->si_errno, Efault); @@ -1723,21 +1724,23 @@ COMPAT_SYSCALL_DEFINE5(waitid,  	if (err > 0) {  		signo = SIGCHLD;  		err = 0; -	} - -	if (!err && uru) { -		/* kernel_waitid() overwrites everything in ru */ -		if (COMPAT_USE_64BIT_TIME) -			err = copy_to_user(uru, &ru, sizeof(ru)); -		else -			err = put_compat_rusage(&ru, uru); -		if (err) -			return -EFAULT; +		if (uru) { +			/* kernel_waitid() overwrites everything in ru */ +			if (COMPAT_USE_64BIT_TIME) +				err = copy_to_user(uru, &ru, sizeof(ru)); +			else +				err = put_compat_rusage(&ru, uru); +			if (err) +				return -EFAULT; +		}  	}  	if (!infop)  		return err; +	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) +		return -EFAULT; +  	user_access_begin();  	unsafe_put_user(signo, &infop->si_signo, Efault);  	unsafe_put_user(0, &infop->si_errno, Efault); diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)  int __kernel_text_address(unsigned long addr)  { -	if (core_kernel_text(addr)) -		return 1; -	if (is_module_text_address(addr)) -		return 1; -	if (is_ftrace_trampoline(addr)) -		return 1; -	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) -		return 1; -	if (is_bpf_text_address(addr)) +	if (kernel_text_address(addr))  		return 1;  	/*  	 * There might be init symbols in saved stacktraces. @@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)  int kernel_text_address(unsigned long addr)  { +	bool no_rcu; +	int ret = 1; +  	if (core_kernel_text(addr))  		return 1; + +	/* +	 * If a stack dump happens while RCU is not watching, then +	 * RCU needs to be notified that it requires to start +	 * watching again. 
This can happen either by tracing that +	 * triggers a stack trace, or a WARN() that happens during +	 * coming back from idle, or cpu on or offlining. +	 * +	 * is_module_text_address() as well as the kprobe slots +	 * and is_bpf_text_address() require RCU to be watching. +	 */ +	no_rcu = !rcu_is_watching(); + +	/* Treat this like an NMI as it can happen anywhere */ +	if (no_rcu) +		rcu_nmi_enter(); +  	if (is_module_text_address(addr)) -		return 1; +		goto out;  	if (is_ftrace_trampoline(addr)) -		return 1; +		goto out;  	if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) -		return 1; +		goto out;  	if (is_bpf_text_address(addr)) -		return 1; -	return 0; +		goto out; +	ret = 0; +out: +	if (no_rcu) +		rcu_nmi_exit(); + +	return ret;  }  /* diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..07cc743698d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)  		if (!s)  			continue; +#ifdef CONFIG_DEBUG_KMEMLEAK +		/* Clear stale pointers from reused stack. */ +		memset(s->addr, 0, THREAD_SIZE); +#endif  		tsk->stack_vm_area = s;  		return s->addr;  	} @@ -946,6 +950,24 @@ void mmput(struct mm_struct *mm)  }  EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ +	struct mm_struct *mm = container_of(work, struct mm_struct, +					    async_put_work); + +	__mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ +	if (atomic_dec_and_test(&mm->mm_users)) { +		INIT_WORK(&mm->async_put_work, mmput_async_fn); +		schedule_work(&mm->async_put_work); +	} +} +#endif +  /**   * set_mm_exe_file - change a reference to the mm's executable file   * diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..76ed5921117a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)  /*   * Drops a reference to the pi_state object and frees or caches it   * when the last reference is gone. - * - * Must be called with the hb lock held.   */  static void put_pi_state(struct futex_pi_state *pi_state)  { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)  	 * and has cleaned up the pi_state already  	 */  	if (pi_state->owner) { -		raw_spin_lock_irq(&pi_state->owner->pi_lock); -		list_del_init(&pi_state->list); -		raw_spin_unlock_irq(&pi_state->owner->pi_lock); +		struct task_struct *owner; -		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); +		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +		owner = pi_state->owner; +		if (owner) { +			raw_spin_lock(&owner->pi_lock); +			list_del_init(&pi_state->list); +			raw_spin_unlock(&owner->pi_lock); +		} +		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); +		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);  	} -	if (current->pi_state_cache) +	if (current->pi_state_cache) {  		kfree(pi_state); -	else { +	} else {  		/*  		 * pi_state->list is already empty.  		 * clear pi_state->owner. @@ -899,22 +903,41 @@ void exit_pi_state_list(struct task_struct *curr)  	 */  	raw_spin_lock_irq(&curr->pi_lock);  	while (!list_empty(head)) { -  		next = head->next;  		pi_state = list_entry(next, struct futex_pi_state, list);  		key = pi_state->key;  		hb = hash_futex(&key); + +		/* +		 * We can race against put_pi_state() removing itself from the +		 * list (a waiter going away). 
put_pi_state() will first +		 * decrement the reference count and then modify the list, so +		 * its possible to see the list entry but fail this reference +		 * acquire. +		 * +		 * In that case; drop the locks to let put_pi_state() make +		 * progress and retry the loop. +		 */ +		if (!atomic_inc_not_zero(&pi_state->refcount)) { +			raw_spin_unlock_irq(&curr->pi_lock); +			cpu_relax(); +			raw_spin_lock_irq(&curr->pi_lock); +			continue; +		}  		raw_spin_unlock_irq(&curr->pi_lock);  		spin_lock(&hb->lock); - -		raw_spin_lock_irq(&curr->pi_lock); +		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +		raw_spin_lock(&curr->pi_lock);  		/*  		 * We dropped the pi-lock, so re-check whether this  		 * task still owns the PI-state:  		 */  		if (head->next != next) { +			/* retain curr->pi_lock for the loop invariant */ +			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);  			spin_unlock(&hb->lock); +			put_pi_state(pi_state);  			continue;  		} @@ -922,9 +945,9 @@ void exit_pi_state_list(struct task_struct *curr)  		WARN_ON(list_empty(&pi_state->list));  		list_del_init(&pi_state->list);  		pi_state->owner = NULL; -		raw_spin_unlock_irq(&curr->pi_lock); -		get_pi_state(pi_state); +		raw_spin_unlock(&curr->pi_lock); +		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);  		spin_unlock(&hb->lock);  		rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1231,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,  	WARN_ON(!list_empty(&pi_state->list));  	list_add(&pi_state->list, &p->pi_state_list); +	/* +	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe +	 * because there is no concurrency as the object is not published yet. +	 */  	pi_state->owner = p;  	raw_spin_unlock_irq(&p->pi_lock); @@ -1560,8 +1587,16 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)  	int oldval, ret;  	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { -		if (oparg < 0 || oparg > 31) -			return -EINVAL; +		if (oparg < 0 || oparg > 31) { +			char comm[sizeof(current->comm)]; +			/* +			 * kill this print and return -EINVAL when userspace +			 * is sane again +			 */ +			pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", +					get_task_comm(comm, current), oparg); +			oparg &= 31; +		}  		oparg = 1 << oparg;  	} @@ -2878,6 +2913,7 @@ retry:  		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);  		spin_unlock(&hb->lock); +		/* drops pi_state->pi_mutex.wait_lock */  		ret = wake_futex_pi(uaddr, uval, pi_state);  		put_pi_state(pi_state); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 3f409968e466..83f830acbb5f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/futex_compat.c   * diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 752d6486b67e..c6c50e5c680e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'  obj-y := base.o fs.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index c51a49c9be70..9c7c8d5c18f2 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  This code maintains a list of active profiling data structures.   
* diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index edf67c493a8e..6e40ff6be083 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  This code exports profiling data as debugfs files to userspace.   * diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 27bc88a35013..1e32e66c9563 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  This code provides functions to handle gcc's profiling data format   *  introduced with gcc 3.4. Future versions of gcc may change the gcov diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 46a18e72bce6..ca5e5c0ef853 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  This code provides functions to handle gcc's profiling data format   *  introduced with gcc 4.7. diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 92c8e22a29ed..de118ad4a024 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   *  Profiling infrastructure declarations.   * diff --git a/kernel/groups.c b/kernel/groups.c index 434f6665f187..e357bc800111 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Supplementary group IDs   */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1970cafe8f2a..ed15d142694b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o  obj-$(CONFIG_IRQ_TIMINGS) += timings.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d69bd77252a7..e12d35108225 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright (C) 2016 Thomas Gleixner.   * Copyright (C) 2016-2017 Christoph Hellwig. diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index d30a0dd5cc02..befa671fba64 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/irq/autoprobe.c   * diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f51b7b6d2451..5a2ef92c2782 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)  	irqd_clr_managed_shutdown(d); -	if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { +	if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {  		/*  		 * Catch code which fiddles with enable_irq() on a managed  		 * and potentially shutdown IRQ. 
Chained interrupt @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)  			irq_setup_affinity(desc);  			break;  		case IRQ_STARTUP_MANAGED: +			irq_do_set_affinity(d, aff, false);  			ret = __irq_startup(desc); -			irq_set_affinity_locked(d, aff, false);  			break;  		case IRQ_STARTUP_ABORT:  			return 0; diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 638eb9c83d9f..9eb09aef0313 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,8 +18,34 @@  static inline bool irq_needs_fixup(struct irq_data *d)  {  	const struct cpumask *m = irq_data_get_effective_affinity_mask(d); +	unsigned int cpu = smp_processor_id(); -	return cpumask_test_cpu(smp_processor_id(), m); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +	/* +	 * The cpumask_empty() check is a workaround for interrupt chips, +	 * which do not implement effective affinity, but the architecture has +	 * enabled the config switch. Use the general affinity mask instead. +	 */ +	if (cpumask_empty(m)) +		m = irq_data_get_affinity_mask(d); + +	/* +	 * Sanity check. If the mask is not empty when excluding the outgoing +	 * CPU then it must contain at least one online CPU. The outgoing CPU +	 * has been removed from the online mask already. +	 */ +	if (cpumask_any_but(m, cpu) < nr_cpu_ids && +	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { +		/* +		 * If this happens then there was a missed IRQ fixup at some +		 * point. Warn about it and enforce fixup. +		 */ +		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", +			cpumask_pr_args(m), d->irq, cpu); +		return true; +	} +#endif +	return cpumask_test_cpu(cpu, m);  }  static bool migrate_one_irq(struct irq_desc *desc) diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index e75e29e4434a..17f05ef8f575 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Debugging printout:   */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..c26c5bb6b491 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)  }  /** - * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt + * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt   * @d: irq_data + * + * This generic implementation of the irq_mask_ack method is for chips + * with separate enable/disable registers instead of a single mask + * register and where a pending interrupt is acknowledged by setting a + * bit. + * + * Note: This is the only permutation currently used.  Similar generic + * functions should be added here if other permutations are required.   
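The irq_needs_fixup() hunk above relies on the cpumask convention that search helpers return a value greater than or equal to nr_cpu_ids when no CPU matches. A small userspace model of that convention, not the kernel's cpumask implementation:

/*
 * any_and() mimics cpumask_any_and(): it returns the first CPU present
 * in both masks, or NR_CPUS when the intersection is empty, so a
 * ">= NR_CPUS" result means "no online CPU left in the affinity mask".
 */
#include <stdio.h>

#define NR_CPUS 8

static unsigned int any_and(unsigned long a, unsigned long b)
{
	unsigned long both = a & b;

	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		if (both & (1UL << cpu))
			return cpu;
	return NR_CPUS;
}

int main(void)
{
	unsigned long affinity = 1UL << 3;	/* IRQ affine to CPU 3 only */
	unsigned long online   = ~(1UL << 3);	/* CPU 3 has just gone offline */

	if (any_and(affinity, online) >= NR_CPUS)
		printf("no online CPU left in the affinity mask, fixup needed\n");
	return 0;
}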
*/ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) +void irq_gc_mask_disable_and_ack_set(struct irq_data *d)  {  	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);  	struct irq_chip_type *ct = irq_data_get_chip_type(d);  	u32 mask = d->mask;  	irq_gc_lock(gc); -	irq_reg_writel(gc, mask, ct->regs.mask); +	irq_reg_writel(gc, mask, ct->regs.disable); +	*ct->mask_cache &= ~mask;  	irq_reg_writel(gc, mask, ct->regs.ack);  	irq_gc_unlock(gc);  } @@ -322,7 +331,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  		/* Calc pointer to the next generic chip */  		tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);  	} -	d->name = name;  	return 0;  }  EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a4aa39009f0d..44ed5f8c8759 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * IRQ subsystem internal functions and variables:   * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)  	struct irq_desc *desc;  	struct irq_domain *domain;  	struct radix_tree_iter iter; -	void **slot; +	void __rcu **slot;  	int i;  	seq_printf(m, " %-16s  %-6s  %-10s  %-10s  %s\n", @@ -1453,7 +1453,7 @@ out_free_desc:  /* The irq_data was moved, fix the revmap to refer to the new location */  static void irq_domain_fix_revmap(struct irq_data *d)  { -	void **slot; +	void __rcu **slot;  	if (d->hwirq < d->domain->revmap_size)  		return; /* Not using radix tree. */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..4bff6a10ae8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)  			set_bit(IRQTF_AFFINITY, &action->thread_flags);  } +static void irq_validate_effective_affinity(struct irq_data *data) +{ +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +	const struct cpumask *m = irq_data_get_effective_affinity_mask(data); +	struct irq_chip *chip = irq_data_get_irq_chip(data); + +	if (!cpumask_empty(m)) +		return; +	pr_warn_once("irq_chip %s did not update eff. 
affinity mask of irq %u\n", +		     chip->name, data->irq); +#endif +} +  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  			bool force)  { @@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,  	struct irq_chip *chip = irq_data_get_irq_chip(data);  	int ret; +	if (!chip || !chip->irq_set_affinity) +		return -EINVAL; +  	ret = chip->irq_set_affinity(data, mask, force);  	switch (ret) {  	case IRQ_SET_MASK_OK:  	case IRQ_SET_MASK_OK_DONE:  		cpumask_copy(desc->irq_common_data.affinity, mask);  	case IRQ_SET_MASK_OK_NOCOPY: +		irq_validate_effective_affinity(data);  		irq_set_thread_affinity(desc);  		ret = 0;  	} @@ -1643,6 +1660,10 @@ const void *free_irq(unsigned int irq, void *dev_id)  #endif  	action = __free_irq(irq, dev_id); + +	if (!action) +		return NULL; +  	devname = action->name;  	kfree(action);  	return devname; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6ca054a3f91d..86ae0eb80b53 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/irq.h>  #include <linux/interrupt.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6376b4a598d3..c010cc0daf79 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/irq/proc.c   * diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index b86886beee4f..1d08f45135c2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/irq/resend.c   * diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 320579d89091..e43795cd2ccf 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Internal header to deal with irq_desc->status which will be renamed   * to irq_desc->settings. 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 061ba7eed4ed..987d7bca4864 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/irq/spurious.c   * diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..a0e3d7a0e8b8 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/kernel.h>  #include <linux/syscalls.h>  #include <linux/fdtable.h> @@ -131,7 +132,7 @@ static int kcmp_epoll_target(struct task_struct *task1,  	if (filp_epoll) {  		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);  		fput(filp_epoll); -	} else +	}  	if (IS_ERR(filp_tgt))  		return PTR_ERR(filp_tgt); diff --git a/kernel/kcov.c b/kernel/kcov.c index 3f693a0f6f3e..fc6af9e1308b 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #define pr_fmt(fmt) "kcov: " fmt  #define DISABLE_BRANCH_PROFILING diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 50dfcb039a41..48aaf2ac0d0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef LINUX_KEXEC_INTERNAL_H  #define LINUX_KEXEC_INTERNAL_H diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..bf8c8fd72589 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)  }  EXPORT_SYMBOL_GPL(klp_register_patch); +/* + * Remove parts of patches that touch a given kernel module. The list of + * patches processed might be limited. When limit is NULL, all patches + * will be handled. + */ +static void klp_cleanup_module_patches_limited(struct module *mod, +					       struct klp_patch *limit) +{ +	struct klp_patch *patch; +	struct klp_object *obj; + +	list_for_each_entry(patch, &klp_patches, list) { +		if (patch == limit) +			break; + +		klp_for_each_object(patch, obj) { +			if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) +				continue; + +			/* +			 * Only unpatch the module if the patch is enabled or +			 * is in transition. +			 */ +			if (patch->enabled || patch == klp_transition_patch) { +				pr_notice("reverting patch '%s' on unloading module '%s'\n", +					  patch->mod->name, obj->mod->name); +				klp_unpatch_object(obj); +			} + +			klp_free_object_loaded(obj); +			break; +		} +	} +} +  int klp_module_coming(struct module *mod)  {  	int ret; @@ -894,7 +929,7 @@ err:  	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",  		patch->mod->name, obj->mod->name, obj->mod->name);  	mod->klp_alive = false; -	klp_free_object_loaded(obj); +	klp_cleanup_module_patches_limited(mod, patch);  	mutex_unlock(&klp_mutex);  	return ret; @@ -902,9 +937,6 @@ err:  void klp_module_going(struct module *mod)  { -	struct klp_patch *patch; -	struct klp_object *obj; -  	if (WARN_ON(mod->state != MODULE_STATE_GOING &&  		    mod->state != MODULE_STATE_COMING))  		return; @@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)  	 */  	mod->klp_alive = false; -	list_for_each_entry(patch, &klp_patches, list) { -		klp_for_each_object(patch, obj) { -			if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) -				continue; - -			/* -			 * Only unpatch the module if the patch is enabled or -			 * is in transition. 
-			 */ -			if (patch->enabled || patch == klp_transition_patch) { -				pr_notice("reverting patch '%s' on unloading module '%s'\n", -					  patch->mod->name, obj->mod->name); -				klp_unpatch_object(obj); -			} - -			klp_free_object_loaded(obj); -			break; -		} -	} +	klp_cleanup_module_patches_limited(mod, NULL);  	mutex_unlock(&klp_mutex);  } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index c74f24c47837..a351601d7f76 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LIVEPATCH_CORE_H  #define _LIVEPATCH_CORE_H diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index 0db227170c36..e72d8250d04b 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LIVEPATCH_PATCH_H  #define _LIVEPATCH_PATCH_H diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index ce09b326546c..0f6e27c481f9 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LIVEPATCH_TRANSITION_H  #define _LIVEPATCH_TRANSITION_H diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 760158d9d98d..392c7f23af76 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  # Any varying coverage in these files is non-deterministic  # and is generally not a function of system call inputs.  KCOV_INSTRUMENT		:= n diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 44c8d0d17170..e36e652d996f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	       struct held_lock *next, int distance, struct stack_trace *trace,  	       int (*save)(struct stack_trace *trace))  { +	struct lock_list *uninitialized_var(target_entry);  	struct lock_list *entry; -	int ret;  	struct lock_list this; -	struct lock_list *uninitialized_var(target_entry); +	int ret;  	/*  	 * Prove that the new <prev> -> <next> dependency would not @@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	this.class = hlock_class(next);  	this.parent = NULL;  	ret = check_noncircular(&this, hlock_class(prev), &target_entry); -	if (unlikely(!ret)) +	if (unlikely(!ret)) { +		if (!trace->entries) { +			/* +			 * If @save fails here, the printing might trigger +			 * a WARN but because of the !nr_entries it should +			 * not do bad things. 
+			 */ +			save(trace); +		}  		return print_circular_bug(&this, target_entry, next, prev, trace); +	}  	else if (unlikely(ret < 0))  		return print_bfs_bug(ret); @@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		return print_bfs_bug(ret); -	if (save && !save(trace)) +	if (!trace->entries && !save(trace))  		return 0;  	/* @@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	if (!ret)  		return 0; -	/* -	 * Debugging printouts: -	 */ -	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { -		graph_unlock(); -		printk("\n new dependency: "); -		print_lock_name(hlock_class(prev)); -		printk(KERN_CONT " => "); -		print_lock_name(hlock_class(next)); -		printk(KERN_CONT "\n"); -		dump_stack(); -		if (!graph_lock()) -			return 0; -	}  	return 2;  } @@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)  {  	int depth = curr->lockdep_depth;  	struct held_lock *hlock; -	struct stack_trace trace; -	int (*save)(struct stack_trace *trace) = save_trace; +	struct stack_trace trace = { +		.nr_entries = 0, +		.max_entries = 0, +		.entries = NULL, +		.skip = 0, +	};  	/*  	 * Debugging checks. @@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)  			 */  			if (hlock->read != 2 && hlock->check) {  				int ret = check_prev_add(curr, hlock, next, -							 distance, &trace, save); +							 distance, &trace, save_trace);  				if (!ret)  					return 0;  				/* -				 * Stop saving stack_trace if save_trace() was -				 * called at least once: -				 */ -				if (save && ret == 2) -					save = NULL; - -				/*  				 * Stop after the first non-trylock entry,  				 * as non-trylock entries have added their  				 * own direct dependencies already, so this diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 1da4669d57a7..d459d624ba2a 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * kernel/lockdep_internals.h   * diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 68d9e267ccd4..ad69bbc9bd28 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kernel/lockdep_proc.c   * diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 6a385aabcce7..f046b7ce9dd6 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * MCS lock defines   * diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 4174417d5309..1edd3f45a4ec 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Mutexes: blocking mutual exclusion locks   * diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6ebc1902f779..1c2287d3fa71 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Mutexes: blocking mutual exclusion locks   * diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a74ee6abd039..6ef600aa0f47 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/percpu.h>  #include <linux/sched.h>  #include <linux/osq_lock.h> diff --git 
a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 43555681c40b..15b6a39366c6 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _GEN_PV_LOCK_SLOWPATH  #error "do not include this file"  #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index f4a74e78d467..fd4fe1f5b458 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * RT-Mutexes: blocking mutual exclusion locks with PI support   * diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 5078c6ddf4a5..fc549713bba3 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * RT-Mutexes: blocking mutual exclusion locks with PI support   * diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 5c253caffe91..732f96abf462 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * RT-Mutexes: blocking mutual exclusion locks with PI support   * diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7453be0485a5..124e98ca0b17 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * RT Mutexes: blocking mutual exclusion locks with PI support   * diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 0848634c5512..a7ffb2a96ede 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /* rwsem-spinlock.c: R/W semaphores: contention handling functions for   * generic spinlock implementation   * diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..e795908f3607 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /* rwsem.c: R/W semaphores: contention handling functions   *   * Written by David Howells ([email protected]). @@ -613,6 +614,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)  	DEFINE_WAKE_Q(wake_q);  	/* +	* __rwsem_down_write_failed_common(sem) +	*   rwsem_optimistic_spin(sem) +	*     osq_unlock(sem->osq) +	*   ... +	*   atomic_long_add_return(&sem->count) +	* +	*      - VS - +	* +	*              __up_write() +	*                if (atomic_long_sub_return_release(&sem->count) < 0) +	*                  rwsem_wake(sem) +	*                    osq_is_locked(&sem->osq) +	* +	* And __up_write() must observe !osq_is_locked() when it observes the +	* atomic_long_add_return() in order to not miss a wakeup. +	* +	* This boils down to: +	* +	* [S.rel] X = 1                [RmW] r0 = (Y += 0) +	*         MB                         RMB +	* [RmW]   Y += 1               [L]   r1 = X +	* +	* exists (r0=1 /\ r1=0) +	*/ +	smp_rmb(); + +	/*  	 * If a spinner is present, it is not necessary to do the wakeup.  	 
* Try to do wakeup only if the trylock succeeds to minimize  	 * spinlock contention which may introduce too much delay in the diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4d48b1c4870d..a6c76a4832b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /* kernel/rwsem.c: R/W semaphores, public implementation   *   * Written by David Howells ([email protected]). diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a699f4048ba1..a883b8f1fdc6 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * The owner field of the rw_semaphore structure will be set to   * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..6e40fdfba326 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright (2004) Linus Torvalds   * diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,  	pgprot_t pgprot = PAGE_KERNEL;  	struct dev_pagemap *pgmap;  	struct page_map *page_map; -	int error, nid, is_ram; +	int error, nid, is_ram, i = 0;  	align_start = res->start & ~(SECTION_SIZE - 1);  	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,  		list_del(&page->lru);  		page->pgmap = pgmap;  		percpu_ref_get(ref); +		if (!(++i % 1024)) +			cond_resched();  	}  	devres_add(dev, page_map);  	return __va(res->start); diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing,  	}								\  	int param_get_##name(char *buffer, const struct kernel_param *kp) \  	{								\ -		return scnprintf(buffer, PAGE_SIZE, format,		\ +		return scnprintf(buffer, PAGE_SIZE, format "\n",	\  				*((type *)kp->arg));			\  	}								\  	const struct kernel_param_ops param_ops_##name = {			\ @@ -236,14 +236,14 @@ char *parse_args(const char *doing,  	EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte,	unsigned char,		"%hhu", kstrtou8); +STANDARD_PARAM_DEF(short,	short,			"%hi",  kstrtos16); +STANDARD_PARAM_DEF(ushort,	unsigned short,		"%hu",  kstrtou16); +STANDARD_PARAM_DEF(int,		int,			"%i",   kstrtoint); +STANDARD_PARAM_DEF(uint,	unsigned int,		"%u",   kstrtouint); +STANDARD_PARAM_DEF(long,	long,			"%li",  kstrtol); +STANDARD_PARAM_DEF(ulong,	unsigned long,		"%lu",  kstrtoul); +STANDARD_PARAM_DEF(ullong,	unsigned long long,	"%llu", kstrtoull);  int param_set_charp(const char *val, const struct kernel_param *kp)  { @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);  int param_get_charp(char *buffer, const struct kernel_param *kp)  { -	return 
scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); +	return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));  }  EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);  int param_get_bool(char *buffer, const struct kernel_param *kp)  {  	/* Y and N chosen as being relatively non-coder friendly */ -	return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); +	return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');  }  EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);  int param_get_invbool(char *buffer, const struct kernel_param *kp)  { -	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); +	return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');  }  EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)  	struct kernel_param p = *kp;  	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { +		/* Replace \n with comma */  		if (i) -			buffer[off++] = ','; +			buffer[off - 1] = ',';  		p.arg = arr->elem + arr->elemsize * i;  		check_kparam_locked(p.mod);  		ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);  int param_get_string(char *buffer, const struct kernel_param *kp)  {  	const struct kparam_string *kps = kp->str; -	return strlcpy(buffer, kps->string, kps->maxlen); +	return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);  }  EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,  	kernel_param_lock(mk->mod);  	count = attribute->param->ops->get(buf, attribute->param);  	kernel_param_unlock(mk->mod); -	if (count > 0) { -		strcat(buf, "\n"); -		++count; -	}  	return count;  } @@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);  /*   * add_sysfs_param - add a parameter to sysfs   * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs   * @name: name of parameter   *   * Create a kobject if for a (per-module) parameter if mp NULL, and diff --git a/kernel/power/Makefile b/kernel/power/Makefile index eb4f717705ba..a3f79f0eef36 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 9012ecf7b814..41e83a779e19 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kernel/power/autosleep.c   * diff --git a/kernel/power/console.c b/kernel/power/console.c index 0e781798b0b3..fcdf0e14a47d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Functions for saving/restoring console.   
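The param_get_*() changes above append the trailing newline by passing format "\n" to scnprintf(), which works because adjacent string literals are concatenated at compile time inside the STANDARD_PARAM_DEF() macro. A tiny standalone demonstration of that mechanism, not the kernel's param_ops machinery:

#include <stdio.h>

/* a macro argument holding "%u" expands to the single literal "%u\n" */
#define SHOW(format, val)	printf(format "\n", (val))

int main(void)
{
	SHOW("%u", 42u);	/* prints 42 followed by a newline */
	SHOW("%s", "on");
	return 0;
}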
* diff --git a/kernel/power/power.h b/kernel/power/power.h index 1d2d761e3c25..f29cd178df90 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #include <linux/suspend.h>  #include <linux/suspend_ioctls.h>  #include <linux/utsname.h> diff --git a/kernel/power/process.c b/kernel/power/process.c index 50f25cb370c6..7381d49a44db 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * drivers/power/process.c - Functions for starting/stopping processes on    *                           suspend transitions. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void)  		 * frozen processes + suspended devices + idle processors.  		 * Thus s2idle_enter() should be called right after  		 * all devices have been suspended. +		 * +		 * Wakeups during the noirq suspend of devices may be spurious, +		 * so prevent them from terminating the loop right away.  		 */  		error = dpm_noirq_suspend_devices(PMSG_SUSPEND);  		if (!error)  			s2idle_enter(); +		else if (error == -EBUSY && pm_wakeup_pending()) +			error = 0; -		dpm_noirq_resume_devices(PMSG_RESUME); -		if (error && (error != -EBUSY || !pm_wakeup_pending())) { -			dpm_noirq_end(); -			break; -		} - -		if (s2idle_ops && s2idle_ops->wake) +		if (!error && s2idle_ops && s2idle_ops->wake)  			s2idle_ops->wake(); +		dpm_noirq_resume_devices(PMSG_RESUME); +  		dpm_noirq_end(); +		if (error) +			break; +  		if (s2idle_ops && s2idle_ops->sync)  			s2idle_ops->sync(); diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 1896386e16bb..dfba59be190b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kernel/power/wakelock.c   * diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 61d41ca41844..1d21ebacfdb8 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/kernel.h> diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h index 749a6756843a..123154f86304 100644 --- a/kernel/printk/braille.h +++ b/kernel/printk/braille.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _PRINTK_BRAILLE_H  #define _PRINTK_BRAILLE_H diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 2ca4a8b5fe57..11f19c466af5 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _CONSOLE_CMDLINE_H  #define _CONSOLE_CMDLINE_H diff --git a/kernel/range.c b/kernel/range.c index 82cfc285b046..d84de6766472 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Range add and subtract   */ diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 13c0fc852767..020e8b6a644b 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  # Any varying coverage in these files is non-deterministic  # and is generally not a function of system call inputs.  
KCOV_INSTRUMENT := n diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,  /**   * call_srcu() - Queue a callback for invocation after an SRCU grace period   * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback.   * @func: function to be invoked after the SRCU grace period   *   * The callback function will be invoked some time after a full SRCU diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 50d1861f7759..3f943efcf61c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)  }  /** + * rcu_sync_enter_start - Force readers onto slow path for multiple updates + * @rsp: Pointer to rcu_sync structure to use for synchronization + *   * Must be called after rcu_sync_init() and before first use.   *   * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() @@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)  /**   * rcu_sync_func() - Callback function managing reader access to fastpath - * @rsp: Pointer to rcu_sync structure to use for synchronization + * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization   *   * This function is passed to one of the call_rcu() functions by   * rcu_sync_exit(), so that it is invoked after a grace period following the @@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)   * rcu_sync_exit().  Otherwise, set all state back to idle so that readers   * can again use their fastpaths.   */ -static void rcu_sync_func(struct rcu_head *rcu) +static void rcu_sync_func(struct rcu_head *rhp)  { -	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); +	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);  	unsigned long flags;  	BUG_ON(rsp->gp_state != GP_PASSED); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1250e4bd4b85..3e3650e94ae6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -882,6 +882,11 @@ void rcu_irq_exit(void)  	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");  	rdtp = this_cpu_ptr(&rcu_dynticks); + +	/* Page faults can happen in NMI handlers, so check... */ +	if (rdtp->dynticks_nmi_nesting) +		return; +  	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&  		     rdtp->dynticks_nesting < 1);  	if (rdtp->dynticks_nesting <= 1) { @@ -1015,6 +1020,11 @@ void rcu_irq_enter(void)  	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");  	rdtp = this_cpu_ptr(&rcu_dynticks); + +	/* Page faults can happen in NMI handlers, so check... */ +	if (rdtp->dynticks_nmi_nesting) +		return; +  	oldval = rdtp->dynticks_nesting;  	rdtp->dynticks_nesting++;  	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -3087,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,   * read-side critical sections have completed. call_rcu_sched() assumes   * that the read-side critical sections end on enabling of preemption   * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - *  - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - *  - anything that disables preemption. 
+ * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption.   *   *  These may be nested.   * @@ -3114,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);   * handler. This means that read-side critical sections in process   * context must not be interrupted by softirqs. This interface is to be   * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - *  - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context. - *  OR - *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - *  These may be nested. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context, OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * + * These may be nested.   *   * See the description of call_rcu() for more detailed information on   * memory ordering guarantees. diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..a9ee16bbc693 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  ifdef CONFIG_FUNCTION_TRACER  CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)  endif diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index de6d7f4dfcb5..a43df5193538 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "sched.h"  #include <linux/proc_fs.h> diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index ce40c810cd5c..27cd22b89824 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifdef CONFIG_SCHED_AUTOGROUP  #include <linux/kref.h> diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index cc873075c3bd..2ddaec40956f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Generic wait-for-completion handler;   * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p)  	put_task_stack(p);  } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ +	/* no filter, everything matches */ +	if (!state_filter) +		return true; + +	/* filter, but doesn't match */ +	if (!(p->state & state_filter)) +		return false; + +	/* +	 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows +	 * TASK_KILLABLE). 
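The state_filter_match() helper above exists because TASK_IDLE is composed of TASK_UNINTERRUPTIBLE plus TASK_NOLOAD, so a plain "show D-state tasks" filter would otherwise also report idle kthreads. A userspace model with stand-in flag values, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

#define TASK_UNINTERRUPTIBLE	0x02
#define TASK_NOLOAD		0x400
#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

static bool state_filter_match(unsigned long filter, unsigned long state)
{
	if (!filter)
		return true;
	if (!(state & filter))
		return false;
	/* asking for D-state only: skip idle kthreads */
	if (filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", state_filter_match(TASK_UNINTERRUPTIBLE, TASK_UNINTERRUPTIBLE));	/* 1 */
	printf("%d\n", state_filter_match(TASK_UNINTERRUPTIBLE, TASK_IDLE));		/* 0 */
	return 0;
}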
+	 */ +	if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) +		return false; + +	return true; +} + +  void show_state_filter(unsigned long state_filter)  {  	struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)  		 */  		touch_nmi_watchdog();  		touch_all_softlockup_watchdogs(); -		if (!state_filter || (p->state & state_filter)) +		if (state_filter_match(state_filter, p))  			sched_show_task(p);  	} diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f95ab29a45d0..44ab32a4fab6 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/cgroup.h>  #include <linux/slab.h>  #include <linux/percpu.h> diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ba72807c73d4..a8358a57a316 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifdef CONFIG_CGROUP_CPUACCT  extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index f7da8c55bba0..b010d26e108e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LINUX_CPUDL_H  #define _LINUX_CPUDL_H diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 63cbb9ca0496..bab050019071 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LINUX_CPUPRI_H  #define _LINUX_CPUPRI_H diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..4ae5c1ea90e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Deadline Scheduling Class (SCHED_DEADLINE)   * diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg)  }  #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; -  static void  print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..5c09ddf8c832 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)   * @@ -5356,91 +5357,62 @@ static int wake_wide(struct task_struct *p)  	return 1;  } -struct llc_stats { -	unsigned long	nr_running; -	unsigned long	load; -	unsigned long	capacity; -	int		has_capacity; -}; +/* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous + * CPU. + * + * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or + *			will be) idle. + * + * wake_affine_weight() - considers the weight to reflect the average + *			  scheduling latency of the CPUs. This seems to work + *			  for the overloaded case. 
+ */ -static bool get_llc_stats(struct llc_stats *stats, int cpu) +static bool +wake_affine_idle(struct sched_domain *sd, struct task_struct *p, +		 int this_cpu, int prev_cpu, int sync)  { -	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - -	if (!sds) -		return false; +	if (idle_cpu(this_cpu)) +		return true; -	stats->nr_running	= READ_ONCE(sds->nr_running); -	stats->load		= READ_ONCE(sds->load); -	stats->capacity		= READ_ONCE(sds->capacity); -	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu); +	if (sync && cpu_rq(this_cpu)->nr_running == 1) +		return true; -	return true; +	return false;  } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - * - * Since we're running on 'stale' values, we might in fact create an imbalance - * but recomputing these values is expensive, as that'd mean iteration 2 cache - * domains worth of CPUs. - */  static bool -wake_affine_llc(struct sched_domain *sd, struct task_struct *p, -		int this_cpu, int prev_cpu, int sync) +wake_affine_weight(struct sched_domain *sd, struct task_struct *p, +		   int this_cpu, int prev_cpu, int sync)  { -	struct llc_stats prev_stats, this_stats;  	s64 this_eff_load, prev_eff_load;  	unsigned long task_load; -	if (!get_llc_stats(&prev_stats, prev_cpu) || -	    !get_llc_stats(&this_stats, this_cpu)) -		return false; +	this_eff_load = target_load(this_cpu, sd->wake_idx); +	prev_eff_load = source_load(prev_cpu, sd->wake_idx); -	/* -	 * If sync wakeup then subtract the (maximum possible) -	 * effect of the currently running task from the load -	 * of the current LLC. -	 */  	if (sync) {  		unsigned long current_load = task_h_load(current); -		/* in this case load hits 0 and this LLC is considered 'idle' */ -		if (current_load > this_stats.load) +		if (current_load > this_eff_load)  			return true; -		this_stats.load -= current_load; +		this_eff_load -= current_load;  	} -	/* -	 * The has_capacity stuff is not SMT aware, but by trying to balance -	 * the nr_running on both ends we try and fill the domain at equal -	 * rates, thereby first consuming cores before siblings. -	 */ - -	/* if the old cache has capacity, stay there */ -	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) -		return false; - -	/* if this cache has capacity, come here */ -	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) -		return true; - -	/* -	 * Check to see if we can move the load without causing too much -	 * imbalance. -	 */  	task_load = task_h_load(p); -	this_eff_load = 100; -	this_eff_load *= prev_stats.capacity; - -	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; -	prev_eff_load *= this_stats.capacity; +	this_eff_load += task_load; +	if (sched_feat(WA_BIAS)) +		this_eff_load *= 100; +	this_eff_load *= capacity_of(prev_cpu); -	this_eff_load *= this_stats.load + task_load; -	prev_eff_load *= prev_stats.load - task_load; +	prev_eff_load -= task_load; +	if (sched_feat(WA_BIAS)) +		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; +	prev_eff_load *= capacity_of(this_cpu);  	return this_eff_load <= prev_eff_load;  } @@ -5449,22 +5421,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,  		       int prev_cpu, int sync)  {  	int this_cpu = smp_processor_id(); -	bool affine; +	bool affine = false; -	/* -	 * Default to no affine wakeups; wake_affine() should not effect a task -	 * placement the load-balancer feels inclined to undo. 
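A back-of-the-envelope model of the wake_affine_weight() comparison introduced above: the task is pulled to the waking CPU only when its projected load there, scaled by the previous CPU's capacity and the imbalance bias, does not exceed the previous CPU's load with the task removed, scaled the other way. All numbers below are invented for illustration:

#include <stdio.h>

int main(void)
{
	long this_load = 100, prev_load = 900, task_load = 300;
	long this_cap = 1024, prev_cap = 1024;
	int imbalance_pct = 117;	/* example bias value */

	/* mirrors the WA_BIAS branch of wake_affine_weight() */
	long this_eff = (this_load + task_load) * 100 * prev_cap;
	long prev_eff = (prev_load - task_load)
			* (100 + (imbalance_pct - 100) / 2) * this_cap;

	printf("pull task to waking CPU: %s\n",
	       this_eff <= prev_eff ? "yes" : "no");	/* yes */
	return 0;
}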
The conservative -	 * option is therefore to not move tasks when they wake up. -	 */ -	affine = false; +	if (sched_feat(WA_IDLE) && !affine) +		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); -	/* -	 * If the wakeup is across cache domains, try to evaluate if movement -	 * makes sense, otherwise rely on select_idle_siblings() to do -	 * placement inside the cache domain. -	 */ -	if (!cpus_share_cache(prev_cpu, this_cpu)) -		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); +	if (sched_feat(WA_WEIGHT) && !affine) +		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);  	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);  	if (affine) { @@ -7600,7 +7563,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)   */  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)  { -	struct sched_domain_shared *shared = env->sd->shared;  	struct sched_domain *child = env->sd->child;  	struct sched_group *sg = env->sd->groups;  	struct sg_lb_stats *local = &sds->local_stat; @@ -7672,22 +7634,6 @@ next_group:  		if (env->dst_rq->rd->overload != overload)  			env->dst_rq->rd->overload = overload;  	} - -	if (!shared) -		return; - -	/* -	 * Since these are sums over groups they can contain some CPUs -	 * multiple times for the NUMA domains. -	 * -	 * Currently only wake_affine_llc() and find_busiest_group() -	 * uses these numbers, only the last is affected by this problem. -	 * -	 * XXX fix that. -	 */ -	WRITE_ONCE(shared->nr_running,	sds->total_running); -	WRITE_ONCE(shared->load,	sds->total_load); -	WRITE_ONCE(shared->capacity,	sds->total_capacity);  }  /** @@ -8098,6 +8044,13 @@ static int should_we_balance(struct lb_env *env)  	int cpu, balance_cpu = -1;  	/* +	 * Ensure the balancing environment is consistent; can happen +	 * when the softirq triggers 'during' hotplug. +	 */ +	if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) +		return 0; + +	/*  	 * In the newly idle case, we will allow all the cpu's  	 * to do the newly idle load balance.  	 */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d3fb15555291..9552fd5854bf 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Only give sleepers 50% of their service deficit. This allows   * them to run sooner, but does not allow tons of sleepers to @@ -81,3 +82,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)  SCHED_FEAT(LB_MIN, false)  SCHED_FEAT(ATTACH_AGE_LOAD, true) +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 0c00172db63e..d518664cce4f 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "sched.h"  /* diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f14716a3522f..89a989e4d758 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kernel/sched/loadavg.c   * diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a92fddc22747..dd7908743dab 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,7 @@  #include <linux/membarrier.h>  #include <linux/tick.h>  #include <linux/cpumask.h> +#include <linux/atomic.h>  #include "sched.h"	/* for cpu_rq(). */ @@ -26,21 +27,26 @@   * except MEMBARRIER_CMD_QUERY.   
 */  #define MEMBARRIER_CMD_BITMASK	\ -	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\ +	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)  static void ipi_mb(void *info)  {  	smp_mb();	/* IPIs should be serializing but paranoid. */  }  -static void membarrier_private_expedited(void) +static int membarrier_private_expedited(void)  {  	int cpu;  	bool fallback = false;  	cpumask_var_t tmpmask;  +	if (!(atomic_read(&current->mm->membarrier_state) +			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) +		return -EPERM; +  	if (num_online_cpus() == 1) -		return; +		return 0;  	/*  	 * Matches memory barriers around rq->curr modification in @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)  	 * rq->curr modification in scheduler.  	 */  	smp_mb();	/* exit from system call is not a mb */ +	return 0; +} + +static void membarrier_register_private_expedited(void) +{ +	struct task_struct *p = current; +	struct mm_struct *mm = p->mm; + +	/* +	 * We need to consider threads belonging to different thread +	 * groups, which use the same mm. (CLONE_VM but not +	 * CLONE_THREAD). +	 */ +	if (atomic_read(&mm->membarrier_state) +			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) +		return; +	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, +			&mm->membarrier_state);  }  /** @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)  			synchronize_sched();  		return 0;  	case MEMBARRIER_CMD_PRIVATE_EXPEDITED: -		membarrier_private_expedited(); +		return membarrier_private_expedited(); +	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: +		membarrier_register_private_expedited();  		return 0;  	default:  		return -EINVAL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..3c96c80e0992 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR   * policies) diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index cd200d16529e..a26473674fb7 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
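A userspace usage sketch of the registration step added above: MEMBARRIER_CMD_PRIVATE_EXPEDITED now fails with -EPERM unless the process has first issued MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED. This assumes a kernel and UAPI headers that provide both commands:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static long sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* opt in once per process (per mm) ... */
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		perror("register private expedited");

	/* ... then expedited barriers over this mm's threads are allowed */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		perror("private expedited barrier");
	else
		printf("barrier executed on all running threads of this process\n");
	return 0;
}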
*/  static const u32 runnable_avg_yN_inv[] = { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..3b448ba82225 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #include <linux/sched.h>  #include <linux/sched/autogroup.h> diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..940b1fa1d2ce 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/slab.h>  #include <linux/fs.h> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index d5710651043b..baf500d12b7c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..45caf90b24cd 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "sched.h"  /* diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 2227e183e202..9ff1555341ed 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/sched/signal.h>  #include <linux/swait.h> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1cf4f306a82..6798276d29af 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Scheduler topology setup/handling methods   */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..418a1c045933 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/seccomp.c   * @@ -17,11 +18,13 @@  #include <linux/audit.h>  #include <linux/compat.h>  #include <linux/coredump.h> +#include <linux/kmemleak.h>  #include <linux/sched.h>  #include <linux/sched/task_stack.h>  #include <linux/seccomp.h>  #include <linux/slab.h>  #include <linux/syscalls.h> +#include <linux/sysctl.h>  #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER  #include <asm/syscall.h> @@ -42,6 +45,7 @@   *         get/put helpers should be used when accessing an instance   *         outside of a lifetime-guarded section.  In general, this   *         is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged   * @prev: points to a previously installed, or inherited, filter   * @prog: the BPF program to evaluate   * @@ -57,6 +61,7 @@   */  struct seccomp_filter {  	refcount_t usage; +	bool log;  	struct seccomp_filter *prev;  	struct bpf_prog *prog;  }; @@ -171,10 +176,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)  /**   * seccomp_run_filters - evaluates all seccomp filters against @sd   * @sd: optional seccomp data to be passed to filters + * @match: stores struct seccomp_filter that resulted in the return value, + *         unless filter returned SECCOMP_RET_ALLOW, in which case it will + *         be unchanged.   *   * Returns valid seccomp BPF response codes.   
*/ -static u32 seccomp_run_filters(const struct seccomp_data *sd) +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) +static u32 seccomp_run_filters(const struct seccomp_data *sd, +			       struct seccomp_filter **match)  {  	struct seccomp_data sd_local;  	u32 ret = SECCOMP_RET_ALLOW; @@ -184,7 +194,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)  	/* Ensure unexpected behavior doesn't result in failing open. */  	if (unlikely(WARN_ON(f == NULL))) -		return SECCOMP_RET_KILL; +		return SECCOMP_RET_KILL_PROCESS;  	if (!sd) {  		populate_seccomp_data(&sd_local); @@ -198,8 +208,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)  	for (; f; f = f->prev) {  		u32 cur_ret = BPF_PROG_RUN(f->prog, sd); -		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) +		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {  			ret = cur_ret; +			*match = f; +		}  	}  	return ret;  } @@ -444,6 +456,10 @@ static long seccomp_attach_filter(unsigned int flags,  			return ret;  	} +	/* Set log flag, if present. */ +	if (flags & SECCOMP_FILTER_FLAG_LOG) +		filter->log = true; +  	/*  	 * If there is an existing filter, make it the prev and don't drop its  	 * task reference. @@ -458,14 +474,19 @@ static long seccomp_attach_filter(unsigned int flags,  	return 0;  } +static void __get_seccomp_filter(struct seccomp_filter *filter) +{ +	/* Reference count is bounded by the number of total processes. */ +	refcount_inc(&filter->usage); +} +  /* get_seccomp_filter - increments the reference count of the filter on @tsk */  void get_seccomp_filter(struct task_struct *tsk)  {  	struct seccomp_filter *orig = tsk->seccomp.filter;  	if (!orig)  		return; -	/* Reference count is bounded by the number of total processes. */ -	refcount_inc(&orig->usage); +	__get_seccomp_filter(orig);  }  static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -476,10 +497,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)  	}  } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig)  { -	struct seccomp_filter *orig = tsk->seccomp.filter;  	/* Clean up single-reference branches iteratively. 
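The seccomp_run_filters() change above masks each filter's return value with SECCOMP_RET_ACTION_FULL and compares the result as a signed 32-bit quantity, so SECCOMP_RET_KILL_PROCESS, the only action with the top bit set, always takes precedence. A standalone model of that reduction; the constants mirror the UAPI values around this series and are redefined here purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define SECCOMP_RET_KILL_PROCESS	0x80000000U
#define SECCOMP_RET_ERRNO		0x00050000U
#define SECCOMP_RET_ALLOW		0x7fff0000U
#define SECCOMP_RET_ACTION_FULL		0xffff0000U

#define ACTION_ONLY(ret)	((int32_t)((ret) & SECCOMP_RET_ACTION_FULL))

int main(void)
{
	uint32_t returns[] = { SECCOMP_RET_ERRNO | 1, SECCOMP_RET_KILL_PROCESS };
	uint32_t ret = SECCOMP_RET_ALLOW;	/* starting state, as in the kernel */

	for (unsigned int i = 0; i < sizeof(returns) / sizeof(returns[0]); i++)
		if (ACTION_ONLY(returns[i]) < ACTION_ONLY(ret))
			ret = returns[i];

	/* kill_process wins over errno(1) and allow */
	printf("winning action: %#x\n", (unsigned int)(ret & SECCOMP_RET_ACTION_FULL));
	return 0;
}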
*/  	while (orig && refcount_dec_and_test(&orig->usage)) {  		struct seccomp_filter *freeme = orig; @@ -488,6 +507,12 @@ void put_seccomp_filter(struct task_struct *tsk)  	}  } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ +	__put_seccomp_filter(tsk->seccomp.filter); +} +  static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)  {  	memset(info, 0, sizeof(*info)); @@ -514,6 +539,65 @@ static void seccomp_send_sigsys(int syscall, int reason)  }  #endif	/* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL_PROCESS	(1 << 0) +#define SECCOMP_LOG_KILL_THREAD		(1 << 1) +#define SECCOMP_LOG_TRAP		(1 << 2) +#define SECCOMP_LOG_ERRNO		(1 << 3) +#define SECCOMP_LOG_TRACE		(1 << 4) +#define SECCOMP_LOG_LOG			(1 << 5) +#define SECCOMP_LOG_ALLOW		(1 << 6) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | +				    SECCOMP_LOG_KILL_THREAD  | +				    SECCOMP_LOG_TRAP  | +				    SECCOMP_LOG_ERRNO | +				    SECCOMP_LOG_TRACE | +				    SECCOMP_LOG_LOG; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, +			       bool requested) +{ +	bool log = false; + +	switch (action) { +	case SECCOMP_RET_ALLOW: +		break; +	case SECCOMP_RET_TRAP: +		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; +		break; +	case SECCOMP_RET_ERRNO: +		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; +		break; +	case SECCOMP_RET_TRACE: +		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; +		break; +	case SECCOMP_RET_LOG: +		log = seccomp_actions_logged & SECCOMP_LOG_LOG; +		break; +	case SECCOMP_RET_KILL_THREAD: +		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; +		break; +	case SECCOMP_RET_KILL_PROCESS: +	default: +		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; +	} + +	/* +	 * Force an audit message to be emitted when the action is RET_KILL_*, +	 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is +	 * allowed to be logged by the admin. +	 */ +	if (log) +		return __audit_seccomp(syscall, signr, action); + +	/* +	 * Let the audit subsystem decide if the action should be audited based +	 * on whether the current task itself is being audited. +	 */ +	return audit_seccomp(syscall, signr, action); +} +  /*   * Secure computing mode 1 allows only read/write/exit/sigreturn.   
* To be fully secure this must be combined with rlimit @@ -539,7 +623,7 @@ static void __secure_computing_strict(int this_syscall)  #ifdef SECCOMP_DEBUG  	dump_stack();  #endif -	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); +	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);  	do_exit(SIGKILL);  } @@ -566,6 +650,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,  			    const bool recheck_after_trace)  {  	u32 filter_ret, action; +	struct seccomp_filter *match = NULL;  	int data;  	/* @@ -574,9 +659,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,  	 */  	rmb(); -	filter_ret = seccomp_run_filters(sd); +	filter_ret = seccomp_run_filters(sd, &match);  	data = filter_ret & SECCOMP_RET_DATA; -	action = filter_ret & SECCOMP_RET_ACTION; +	action = filter_ret & SECCOMP_RET_ACTION_FULL;  	switch (action) {  	case SECCOMP_RET_ERRNO: @@ -637,14 +722,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,  		return 0; +	case SECCOMP_RET_LOG: +		seccomp_log(this_syscall, 0, action, true); +		return 0; +  	case SECCOMP_RET_ALLOW: +		/* +		 * Note that the "match" filter will always be NULL for +		 * this action since SECCOMP_RET_ALLOW is the starting +		 * state in seccomp_run_filters(). +		 */  		return 0; -	case SECCOMP_RET_KILL: +	case SECCOMP_RET_KILL_THREAD: +	case SECCOMP_RET_KILL_PROCESS:  	default: -		audit_seccomp(this_syscall, SIGSYS, action); +		seccomp_log(this_syscall, SIGSYS, action, true);  		/* Dump core only if this is the last remaining thread. */ -		if (get_nr_threads(current) == 1) { +		if (action == SECCOMP_RET_KILL_PROCESS || +		    get_nr_threads(current) == 1) {  			siginfo_t info;  			/* Show the original registers in the dump. */ @@ -653,13 +749,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,  			seccomp_init_siginfo(&info, this_syscall, data);  			do_coredump(&info);  		} -		do_exit(SIGSYS); +		if (action == SECCOMP_RET_KILL_PROCESS) +			do_group_exit(SIGSYS); +		else +			do_exit(SIGSYS);  	}  	unreachable();  skip: -	audit_seccomp(this_syscall, 0, action); +	seccomp_log(this_syscall, 0, action, match ? match->log : false);  	return -1;  }  #else @@ -794,6 +893,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,  }  #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ +	u32 action; + +	if (copy_from_user(&action, uaction, sizeof(action))) +		return -EFAULT; + +	switch (action) { +	case SECCOMP_RET_KILL_PROCESS: +	case SECCOMP_RET_KILL_THREAD: +	case SECCOMP_RET_TRAP: +	case SECCOMP_RET_ERRNO: +	case SECCOMP_RET_TRACE: +	case SECCOMP_RET_LOG: +	case SECCOMP_RET_ALLOW: +		break; +	default: +		return -EOPNOTSUPP; +	} + +	return 0; +} +  /* Common entry point for both prctl and syscall. 
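A userspace sketch of probing for an action through seccomp_get_action_avail(), reached via the SECCOMP_GET_ACTION_AVAIL operation wired up in the next hunk. It assumes UAPI headers recent enough to define that operation and SECCOMP_RET_LOG:

#include <linux/seccomp.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	__u32 action = SECCOMP_RET_LOG;

	/* flags must be 0 for SECCOMP_GET_ACTION_AVAIL */
	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
		printf("SECCOMP_RET_LOG is known to this kernel\n");
	else if (errno == EOPNOTSUPP)
		printf("SECCOMP_RET_LOG is not recognized here\n");
	else
		perror("seccomp");
	return 0;
}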
*/  static long do_seccomp(unsigned int op, unsigned int flags,  		       const char __user *uargs) @@ -805,6 +927,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,  		return seccomp_set_mode_strict();  	case SECCOMP_SET_MODE_FILTER:  		return seccomp_set_mode_filter(flags, uargs); +	case SECCOMP_GET_ACTION_AVAIL: +		if (flags != 0) +			return -EINVAL; + +		return seccomp_get_action_avail(uargs);  	default:  		return -EINVAL;  	} @@ -908,13 +1035,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,  	if (!data)  		goto out; -	get_seccomp_filter(task); +	__get_seccomp_filter(filter);  	spin_unlock_irq(&task->sighand->siglock);  	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))  		ret = -EFAULT; -	put_seccomp_filter(task); +	__put_seccomp_filter(filter);  	return ret;  out: @@ -922,3 +1049,185 @@ out:  	return ret;  }  #endif + +#ifdef CONFIG_SYSCTL + +/* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process" +#define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread" +#define SECCOMP_RET_TRAP_NAME		"trap" +#define SECCOMP_RET_ERRNO_NAME		"errno" +#define SECCOMP_RET_TRACE_NAME		"trace" +#define SECCOMP_RET_LOG_NAME		"log" +#define SECCOMP_RET_ALLOW_NAME		"allow" + +static const char seccomp_actions_avail[] = +				SECCOMP_RET_KILL_PROCESS_NAME	" " +				SECCOMP_RET_KILL_THREAD_NAME	" " +				SECCOMP_RET_TRAP_NAME		" " +				SECCOMP_RET_ERRNO_NAME		" " +				SECCOMP_RET_TRACE_NAME		" " +				SECCOMP_RET_LOG_NAME		" " +				SECCOMP_RET_ALLOW_NAME; + +struct seccomp_log_name { +	u32		log; +	const char	*name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { +	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, +	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, +	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, +	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, +	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, +	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, +	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, +	{ } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, +					      u32 actions_logged) +{ +	const struct seccomp_log_name *cur; +	bool append_space = false; + +	for (cur = seccomp_log_names; cur->name && size; cur++) { +		ssize_t ret; + +		if (!(actions_logged & cur->log)) +			continue; + +		if (append_space) { +			ret = strscpy(names, " ", size); +			if (ret < 0) +				return false; + +			names += ret; +			size -= ret; +		} else +			append_space = true; + +		ret = strscpy(names, cur->name, size); +		if (ret < 0) +			return false; + +		names += ret; +		size -= ret; +	} + +	return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, +					    const char *name) +{ +	const struct seccomp_log_name *cur; + +	for (cur = seccomp_log_names; cur->name; cur++) { +		if (!strcmp(cur->name, name)) { +			*action_logged = cur->log; +			return true; +		} +	} + +	return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ +	char *name; + +	*actions_logged = 0; +	while ((name = strsep(&names, " ")) && *name) { +		u32 action_logged = 0; + +		if (!seccomp_action_logged_from_name(&action_logged, name)) +			return false; + +		*actions_logged |= action_logged; +	} + +	return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, +					  void __user *buffer, size_t *lenp, +					  loff_t *ppos) +{ +	char names[sizeof(seccomp_actions_avail)]; +	struct 
ctl_table table; +	int ret; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	memset(names, 0, sizeof(names)); + +	if (!write) { +		if (!seccomp_names_from_actions_logged(names, sizeof(names), +						       seccomp_actions_logged)) +			return -EINVAL; +	} + +	table = *ro_table; +	table.data = names; +	table.maxlen = sizeof(names); +	ret = proc_dostring(&table, write, buffer, lenp, ppos); +	if (ret) +		return ret; + +	if (write) { +		u32 actions_logged; + +		if (!seccomp_actions_logged_from_names(&actions_logged, +						       table.data)) +			return -EINVAL; + +		if (actions_logged & SECCOMP_LOG_ALLOW) +			return -EINVAL; + +		seccomp_actions_logged = actions_logged; +	} + +	return 0; +} + +static struct ctl_path seccomp_sysctl_path[] = { +	{ .procname = "kernel", }, +	{ .procname = "seccomp", }, +	{ } +}; + +static struct ctl_table seccomp_sysctl_table[] = { +	{ +		.procname	= "actions_avail", +		.data		= (void *) &seccomp_actions_avail, +		.maxlen		= sizeof(seccomp_actions_avail), +		.mode		= 0444, +		.proc_handler	= proc_dostring, +	}, +	{ +		.procname	= "actions_logged", +		.mode		= 0644, +		.proc_handler	= seccomp_actions_logged_handler, +	}, +	{ } +}; + +static int __init seccomp_sysctl_init(void) +{ +	struct ctl_table_header *hdr; + +	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); +	if (!hdr) +		pr_warn("seccomp: sysctl registration failed\n"); +	else +		kmemleak_not_leak(hdr); + +	return 0; +} + +device_initcall(seccomp_sysctl_init) + +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..8dcd8825b2de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2698,7 +2698,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)  			[SIGSEGV] = { NSIGSEGV, SIL_FAULT },  			[SIGBUS]  = { NSIGBUS,  SIL_FAULT },  			[SIGTRAP] = { NSIGTRAP, SIL_FAULT }, -#if defined(SIGMET) && defined(NSIGEMT) +#if defined(SIGEMT) && defined(NSIGEMT)  			[SIGEMT]  = { NSIGEMT,  SIL_FAULT },  #endif  			[SIGCHLD] = { NSIGCHLD, SIL_CHLD }, diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);   * by the client, but only by calling this function.   * This function can only be called on a registered smp_hotplug_thread.   */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, -					 const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, +					  const struct cpumask *new)  {  	struct cpumask *old = plug_thread->cpumask; -	cpumask_var_t tmp; +	static struct cpumask tmp;  	unsigned int cpu; -	if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) -		return -ENOMEM; - -	get_online_cpus(); +	lockdep_assert_cpus_held();  	mutex_lock(&smpboot_threads_lock);  	/* Park threads that were exclusively enabled on the old mask. */ -	cpumask_andnot(tmp, old, new); -	for_each_cpu_and(cpu, tmp, cpu_online_mask) +	cpumask_andnot(&tmp, old, new); +	for_each_cpu_and(cpu, &tmp, cpu_online_mask)  		smpboot_park_thread(plug_thread, cpu);  	/* Unpark threads that are exclusively enabled on the new mask. 
*/ -	cpumask_andnot(tmp, new, old); -	for_each_cpu_and(cpu, tmp, cpu_online_mask) +	cpumask_andnot(&tmp, new, old); +	for_each_cpu_and(cpu, &tmp, cpu_online_mask)  		smpboot_unpark_thread(plug_thread, cpu);  	cpumask_copy(old, new);  	mutex_unlock(&smpboot_threads_lock); -	put_online_cpus(); - -	free_cpumask_var(tmp); - -	return 0;  } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);  static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 485b81cfab34..34dd3d7ba40b 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef SMPBOOT_H  #define SMPBOOT_H diff --git a/kernel/sys.c b/kernel/sys.c index 9aebc2935013..524a4cb9bbe2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  linux/kernel/sys.c   * diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8acef8576ce9..b5189762d275 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/linkage.h>  #include <linux/errno.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..d9c31bc2eaea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {  		.data		= &sysctl_sched_time_avg,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  #ifdef CONFIG_SCHEDSTATS  	{ @@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = {  #if defined(CONFIG_LOCKUP_DETECTOR)  	{  		.procname       = "watchdog", -		.data           = &watchdog_user_enabled, -		.maxlen         = sizeof (int), -		.mode           = 0644, +		.data		= &watchdog_user_enabled, +		.maxlen		= sizeof(int), +		.mode		= 0644,  		.proc_handler   = proc_watchdog,  		.extra1		= &zero,  		.extra2		= &one, @@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = {  	},  	{  		.procname       = "nmi_watchdog", -		.data           = &nmi_watchdog_enabled, -		.maxlen         = sizeof (int), -		.mode           = 0644, +		.data		= &nmi_watchdog_user_enabled, +		.maxlen		= sizeof(int), +		.mode		= NMI_WATCHDOG_SYSCTL_PERM,  		.proc_handler   = proc_nmi_watchdog,  		.extra1		= &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)  		.extra2		= &one, -#else -		.extra2		= &zero, -#endif  	},  	{  		.procname	= "watchdog_cpumask", @@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = {  #ifdef CONFIG_SOFTLOCKUP_DETECTOR  	{  		.procname       = "soft_watchdog", -		.data           = &soft_watchdog_enabled, -		.maxlen         = sizeof (int), -		.mode           = 0644, +		.data		= &soft_watchdog_user_enabled, +		.maxlen		= sizeof(int), +		.mode		= 0644,  		.proc_handler   = proc_soft_watchdog,  		.extra1		= &zero,  		.extra2		= &one, @@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,  	if (write) {  		if (*lvalp > UINT_MAX)  			return -EINVAL; -		if (*lvalp > UINT_MAX) -			return -EINVAL;  		*valp = *lvalp;  	} else {  		unsigned int val = *valp; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 58ea8c03662e..e8c0dab4fd65 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/stat.h>  #include <linux/sysctl.h>  #include "../fs/xfs/xfs_sysctl.h" diff --git a/kernel/task_work.c b/kernel/task_work.c index 
836a72a66fba..5718b3ea202a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/spinlock.h>  #include <linux/task_work.h>  #include <linux/tracehook.h> diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 938dbf33ef49..f1e46f338a9c 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y += time.o timer.o hrtimer.o  obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o  obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 2ef98a02376a..f26acef5d7b4 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/itimer.c   * diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index edf19cc53140..99e03bec68e4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * NTP state machine interfaces and logic.   * diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index d8a7c11fa71a..0a53e6ea47b1 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LINUX_NTP_INTERNAL_H  #define _LINUX_NTP_INTERNAL_H diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8585ad6e472a..5b117110b55b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Implement CPU time clocks for the POSIX clock interface.   */ diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index fb303c3be4d3..151e28f5bf30 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #define TIMER_RETRY 1  struct k_clock { diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a7bb8f33ae07..58045eb976c3 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * linux/kernel/time/tick-broadcast-hrtimer.c   * This file emulates a local clock event device diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be0ac01f2e12..f8e1845aa464 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * tick internal variable and functions used by low/high res code   */ diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 075444e3d48e..954b43dbf21c 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _TICK_SCHED_H  #define _TICK_SCHED_H diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index d0914676d4c5..c9f9af339914 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _KERNEL_TIME_TIMEKEEPING_H  #define _KERNEL_TIME_TIMEKEEPING_H  /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 9a18f121f399..fdbeeb02dde9 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _TIMEKEEPING_INTERNAL_H  #define _TIMEKEEPING_INTERNAL_H  /* diff --git 
a/kernel/trace/Makefile b/kernel/trace/Makefile index 90f2701d92a7..19a15b2f1190 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  # Do not instrument the tracer itself: diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start)  }  EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ +  /**   * blk_trace_ioctl: - handle the ioctls associated with tracing   * @bdev:	the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)  	if (!q)  		return -ENXIO; -	mutex_lock(&bdev->bd_mutex); +	mutex_lock(&q->blk_trace_mutex);  	switch (cmd) {  	case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)  		break;  	} -	mutex_unlock(&bdev->bd_mutex); +	mutex_unlock(&q->blk_trace_mutex);  	return ret;  } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,  	if (q == NULL)  		goto out_bdput; -	mutex_lock(&bdev->bd_mutex); +	mutex_lock(&q->blk_trace_mutex);  	if (attr == &dev_attr_enable) {  		ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,  		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);  out_unlock_bdev: -	mutex_unlock(&bdev->bd_mutex); +	mutex_unlock(&q->blk_trace_mutex);  out_bdput:  	bdput(bdev);  out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,  	if (q == NULL)  		goto out_bdput; -	mutex_lock(&bdev->bd_mutex); +	mutex_lock(&q->blk_trace_mutex);  	if (attr == &dev_attr_enable) {  		if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,  	}  out_unlock_bdev: -	mutex_unlock(&bdev->bd_mutex); +	mutex_unlock(&q->blk_trace_mutex);  out_bdput:  	bdput(bdev);  out: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;  static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;  static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; -  static int __init set_graph_function(char *str)  {  	strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)  	unregister_pm_notifier(&ftrace_suspend_notifier);  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE -	/* -	 * Function graph does not allocate the trampoline, but -	 * other global_ops do. We need to reset the ALLOC_TRAMP flag -	 * if one was used. 
-	 */ -	global_ops.trampoline = save_global_trampoline; -	if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) -		global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif -   out:  	mutex_unlock(&ftrace_lock);  } diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 0c7dee221dca..21bb161c2316 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Power trace points   * diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c index 4b3b5eaf94d1..25dec0b00280 100644 --- a/kernel/trace/rpm-traces.c +++ b/kernel/trace/rpm-traces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Power trace points   * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {  		int cpu = tracing_get_cpu(inode); +		struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE +		if (tr->current_trace->print_max) +			trace_buf = &tr->max_buffer; +#endif  		if (cpu == RING_BUFFER_ALL_CPUS) -			tracing_reset_online_cpus(&tr->trace_buffer); +			tracing_reset_online_cpus(trace_buf);  		else -			tracing_reset(&tr->trace_buffer, cpu); +			tracing_reset(trace_buf, cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)  	if (t == tr->current_trace)  		goto out; +	/* Some tracers won't work on kernel command line */ +	if (system_state < SYSTEM_RUNNING && t->noboot) { +		pr_warn("Tracer '%s' is not allowed on command line, ignored\n", +			t->name); +		goto out; +	} +  	/* Some tracers are only allowed for the top level buffer */  	if (!trace_ok_for_array(t, tr)) {  		ret = -EINVAL; @@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)  		 *  		 * iter->pos will be 0 if we haven't read anything.  		 
*/ -		if (!tracing_is_on() && iter->pos) +		if (!tracer_tracing_is_on(iter->tr) && iter->pos)  			break;  		mutex_unlock(&iter->mutex); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..401b0639116f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _LINUX_KERNEL_TRACE_H  #define _LINUX_KERNEL_TRACE_H @@ -444,6 +445,8 @@ struct tracer {  #ifdef CONFIG_TRACER_MAX_TRACE  	bool			use_max_tr;  #endif +	/* True if tracer cannot be enabled in kernel param */ +	bool			noboot;  }; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 16a8cf02eee9..79f838a75077 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/delay.h>  #include <linux/module.h>  #include <linux/kthread.h> diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index ebdbfc2f2a64..be1d86ff753d 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #undef TRACE_SYSTEM  #define TRACE_SYSTEM benchmark diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4d8fdf3184dc..4ad967453b6f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * unlikely profiler   * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index adcdbbeae010..e954ae3d82c0 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * This file defines the trace event structures that go into the ring   * buffer directly. They are created via macros so that changes for them diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h index bfd4dba0d603..39d7ef4f57cb 100644 --- a/kernel/trace/trace_events_filter_test.h +++ b/kernel/trace/trace_events_filter_test.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #undef TRACE_SYSTEM  #define TRACE_SYSTEM test diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 39aa7aa66468..548e62eb5c46 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * trace_export.c - export basic ftrace utilities to user space   * diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a0910c0cdf2e..27f7ad12c4b1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * ring buffer based function tracer   * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b8f1f54731af..23c0b0cb5fb9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *   * Function graph tracer. 
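The trace.h hunk above introduces the ->noboot flag so a tracer can declare itself unusable while the kernel is still booting; tracing_set_tracer() now skips such tracers whenever system_state is below SYSTEM_RUNNING, i.e. when the tracer is selected through the ftrace= command-line parameter. The following sketch is illustrative only: the "example" tracer, its callbacks and the initcall are assumptions and not part of these patches, but the pattern mirrors the mmiotrace hunk further down, which sets the same flag on a real tracer.

/*
 * Illustrative sketch (not from the patches): a tracer opting out of
 * boot-time selection via the new ->noboot flag.
 */
#include <linux/module.h>
#include "trace.h"	/* struct tracer, struct trace_array, register_tracer() */

static int example_tracer_init(struct trace_array *tr)
{
	return 0;	/* assumed: nothing to set up for this sketch */
}

static void example_tracer_reset(struct trace_array *tr)
{
}

static struct tracer example_tracer __read_mostly = {
	.name	= "example",	/* hypothetical tracer name */
	.init	= example_tracer_init,
	.reset	= example_tracer_reset,
	.noboot	= true,		/* "ftrace=example" is refused before SYSTEM_RUNNING */
};

__init static int init_example_trace(void)
{
	return register_tracer(&example_tracer);
}
device_initcall(init_example_trace);

With this in place the tracer can still be selected at runtime through tracing/current_tracer; only early selection from the boot command line is rejected, which is exactly the check the tracing_set_tracer() hunk adds.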
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 57149bce6aad..d953c163a079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kdb helper for dumping the ftrace buffer   * diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..b0388016b687 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Memory mapped I/O tracing   * @@ -282,6 +283,7 @@ static struct tracer mmio_tracer __read_mostly =  	.close		= mmio_close,  	.read		= mmio_read,  	.print_line	= mmio_print_line, +	.noboot		= true,  };  __init static int init_mmio_trace(void) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 49f61fe96a6b..50523f953a5d 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * nop tracer   * diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)  	return !trace_seq_has_overflowed(s);  } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ -	int bit = state ? __ffs(state) + 1 : 0; - -	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} -  /**   * ftrace_find_event - find a registered event   * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,  	trace_assign_type(field, iter->ent); -	T = task_state_char(field->next_state); -	S = task_state_char(field->prev_state); +	T = __task_state_to_char(field->next_state); +	S = __task_state_to_char(field->prev_state);  	trace_find_cmdline(field->next_pid, comm);  	trace_seq_printf(&iter->seq,  			 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)  	trace_assign_type(field, iter->ent);  	if (!S) -		S = task_state_char(field->prev_state); -	T = task_state_char(field->next_state); +		S = __task_state_to_char(field->prev_state); +	T = __task_state_to_char(field->next_state);  	trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",  			 field->prev_pid,  			 field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)  	trace_assign_type(field, iter->ent);  	if (!S) -		S = task_state_char(field->prev_state); -	T = task_state_char(field->next_state); +		S = __task_state_to_char(field->prev_state); +	T = __task_state_to_char(field->next_state);  	SEQ_PUT_HEX_FIELD(s, field->prev_pid);  	SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index fabc49bcd493..dbba03ed96de 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef __TRACE_EVENTS_H  #define __TRACE_EVENTS_H diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b341c02730be..e288168661e1 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * trace context switch   * diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 
ddec53b67646..7d461dcd4831 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * trace task wakeup timings   * @@ -397,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->prev_pid			= prev->pid;  	entry->prev_prio		= prev->prio; -	entry->prev_state		= prev->state; +	entry->prev_state		= __get_task_state(prev);  	entry->next_pid			= next->pid;  	entry->next_prio		= next->prio; -	entry->next_state		= next->state; +	entry->next_state		= __get_task_state(next);  	entry->next_cpu	= task_cpu(next);  	if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->prev_pid			= curr->pid;  	entry->prev_prio		= curr->prio; -	entry->prev_state		= curr->state; +	entry->prev_state		= __get_task_state(curr);  	entry->next_pid			= wakee->pid;  	entry->next_prio		= wakee->prio; -	entry->next_state		= wakee->state; +	entry->next_state		= __get_task_state(wakee);  	entry->next_cpu			= task_cpu(wakee);  	if (!call_filter_check_discard(call, entry, buffer, event)) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b17ec642793b..cd70eb5df38e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /* Include in trace.c */  #include <uapi/linux/sched/types.h> diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index b4c475a0a48b..8cda06a10d66 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include "trace.h"  int DYN_FTRACE_TEST_NAME(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..719a52a4064a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright (C) 2008 Steven Rostedt <[email protected]>   * @@ -96,23 +97,9 @@ check_stack(unsigned long ip, unsigned long *stack)  	if (in_nmi())  		return; -	/* -	 * There's a slight chance that we are tracing inside the -	 * RCU infrastructure, and rcu_irq_enter() will not work -	 * as expected. -	 */ -	if (unlikely(rcu_irq_enter_disabled())) -		return; -  	local_irq_save(flags);  	arch_spin_lock(&stack_trace_max_lock); -	/* -	 * RCU may not be watching, make it see us. -	 * The stack trace code uses rcu_sched. -	 */ -	rcu_irq_enter(); -  	/* In case another CPU set the tracer_frame on us */  	if (unlikely(!frame_size))  		this_size -= tracer_frame; @@ -205,7 +192,6 @@ check_stack(unsigned long ip, unsigned long *stack)  	}   out: -	rcu_irq_exit();  	arch_spin_unlock(&stack_trace_max_lock);  	local_irq_restore(flags);  } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 413ff108fbd0..75bf1bcb4a8a 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Infrastructure for statistic tracing (histogram output).   
* diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8f03914b9a6a..76d30b4ebe83 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef __TRACE_STAT_H  #define __TRACE_STAT_H diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..a2a642f2c64f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <trace/syscall.h>  #include <trace/events/syscalls.h>  #include <linux/syscalls.h> diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 618838f5f30a..ab0ca77331d0 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef __TRACING_MAP_H  #define __TRACING_MAP_H diff --git a/kernel/uid16.c b/kernel/uid16.c index 5c2dc5b2bf4f..ce74a4901d2b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *	Wrapper functions for 16bit uid back compatibility. All nicely tied   *	together in the faint hope we can take the out in five years time. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..c8e06703e44c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Detect hard and soft lockups on a system   * @@ -29,20 +30,29 @@  #include <linux/kvm_para.h>  #include <linux/kthread.h> -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); - -int __read_mostly nmi_watchdog_enabled; +static DEFINE_MUTEX(watchdog_mutex);  #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | -						NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT	(SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT	1  #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT	(SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT	0  #endif +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; +int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; + +struct cpumask watchdog_allowed_mask __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); +  #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */  /*   * Should we panic when a soft-lockup or hard-lockup occurs:   */ @@ -56,9 +66,9 @@ unsigned int __read_mostly hardlockup_panic =   * kernel command line parameters are parsed, because otherwise it is not   * possible to override this in hardlockup_panic_setup().   
*/ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void)  { -	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +	nmi_watchdog_user_enabled = 0;  }  static int __init hardlockup_panic_setup(char *str) @@ -68,48 +78,24 @@ static int __init hardlockup_panic_setup(char *str)  	else if (!strncmp(str, "nopanic", 7))  		hardlockup_panic = 0;  	else if (!strncmp(str, "0", 1)) -		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +		nmi_watchdog_user_enabled = 0;  	else if (!strncmp(str, "1", 1)) -		watchdog_enabled |= NMI_WATCHDOG_ENABLED; +		nmi_watchdog_user_enabled = 1;  	return 1;  }  __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif - -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; +# ifdef CONFIG_SMP  int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ +	sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); +	return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */  /*   * These functions can be overridden if an architecture implements its @@ -121,36 +107,68 @@ int __read_mostly watchdog_suspended;   */  int __weak watchdog_nmi_enable(unsigned int cpu)  { +	hardlockup_detector_perf_enable();  	return 0;  } +  void __weak watchdog_nmi_disable(unsigned int cpu)  { +	hardlockup_detector_perf_disable();  } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. The arch hardlockup watchdog should - * respond to the following variables: - * - nmi_watchdog_enabled +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ +	return hardlockup_detector_perf_init(); +} + +/** + * watchdog_nmi_stop - Stop the watchdog for reconfiguration + * + * The reconfiguration steps are: + * watchdog_nmi_stop(); + * update_variables(); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration + * + * Counterpart to watchdog_nmi_stop(). 
+ * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: + * - watchdog_enabled   * - watchdog_thresh   * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic - * - watchdog_suspended   */ -void __weak watchdog_nmi_reconfigure(void) +void __weak watchdog_nmi_start(void) { } + +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). + */ +static void lockup_detector_update_enable(void)  { +	watchdog_enabled = 0; +	if (!watchdog_user_enabled) +		return; +	if (nmi_watchdog_available && nmi_watchdog_user_enabled) +		watchdog_enabled |= NMI_WATCHDOG_ENABLED; +	if (soft_watchdog_user_enabled) +		watchdog_enabled |= SOFT_WATCHDOG_ENABLED;  } -  #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ -	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = +			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly;  static u64 __read_mostly sample_period;  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -164,50 +182,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);  static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);  static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = -			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -  static int __init softlockup_panic_setup(char *str)  {  	softlockup_panic = simple_strtoul(str, NULL, 0); -  	return 1;  }  __setup("softlockup_panic=", softlockup_panic_setup);  static int __init nowatchdog_setup(char *str)  { -	watchdog_enabled = 0; +	watchdog_user_enabled = 0;  	return 1;  }  __setup("nowatchdog", nowatchdog_setup);  static int __init nosoftlockup_setup(char *str)  { -	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; +	soft_watchdog_user_enabled = 0;  	return 1;  }  __setup("nosoftlockup", nosoftlockup_setup);  #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +  static int __init softlockup_all_cpu_backtrace_setup(char *str)  { -	sysctl_softlockup_all_cpu_backtrace = -		!!simple_strtol(str, NULL, 0); +	sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);  	return 1;  }  __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ -	sysctl_hardlockup_all_cpu_backtrace = -		!!simple_strtol(str, NULL, 0); -	return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif  #endif +static void __lockup_detector_cleanup(void); +  /*   * Hard-lockup warnings should be triggered after just a few seconds. Soft-   * lockups can have false positives under extreme conditions. So we generally @@ -278,11 +286,15 @@ void touch_all_softlockup_watchdogs(void)  	int cpu;  	/* -	 * this is done lockless -	 * do we care if a 0 races with a timestamp? -	 * all it means is the softlock check starts one cycle later +	 * watchdog_mutex cannot be taken here, as this might be called +	 * from (soft)interrupt context, so the access to +	 * watchdog_allowed_mask might race with a concurrent update. 
+	 * +	 * The watchdog time stamp can race against a concurrent real +	 * update as well, the only side effect might be a cycle delay for +	 * the softlockup check.  	 */ -	for_each_watchdog_cpu(cpu) +	for_each_cpu(cpu, &watchdog_allowed_mask)  		per_cpu(watchdog_touch_ts, cpu) = 0;  	wq_watchdog_touch(-1);  } @@ -322,9 +334,6 @@ static void watchdog_interrupt_count(void)  	__this_cpu_inc(hrtimer_interrupts);  } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); -  /* watchdog kicker functions */  static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  { @@ -333,7 +342,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  	int duration;  	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; -	if (atomic_read(&watchdog_park_in_progress) != 0) +	if (!watchdog_enabled)  		return HRTIMER_NORESTART;  	/* kick the hardlockup detector */ @@ -447,32 +456,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)  static void watchdog_enable(unsigned int cpu)  { -	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); +	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); -	/* kick off the timer for the hardlockup detector */ +	/* +	 * Start the timer first to prevent the NMI watchdog triggering +	 * before the timer has a chance to fire. +	 */  	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);  	hrtimer->function = watchdog_timer_fn; - -	/* Enable the perf event */ -	watchdog_nmi_enable(cpu); - -	/* done here because hrtimer_start can only pin to smp_processor_id() */  	hrtimer_start(hrtimer, ns_to_ktime(sample_period),  		      HRTIMER_MODE_REL_PINNED); -	/* initialize timestamp */ -	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); +	/* Initialize timestamp */  	__touch_watchdog(); +	/* Enable the perf event */ +	if (watchdog_enabled & NMI_WATCHDOG_ENABLED) +		watchdog_nmi_enable(cpu); + +	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);  }  static void watchdog_disable(unsigned int cpu)  { -	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); +	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);  	watchdog_set_prio(SCHED_NORMAL, 0); -	hrtimer_cancel(hrtimer); -	/* disable the perf event */ +	/* +	 * Disable the perf event first. That prevents that a large delay +	 * between disabling the timer and disabling the perf event causes +	 * the perf NMI to detect a false positive. +	 */  	watchdog_nmi_disable(cpu); +	hrtimer_cancel(hrtimer);  }  static void watchdog_cleanup(unsigned int cpu, bool online) @@ -499,21 +514,6 @@ static void watchdog(unsigned int cpu)  	__this_cpu_write(soft_lockup_hrtimer_cnt,  			 __this_cpu_read(hrtimer_interrupts));  	__touch_watchdog(); - -	/* -	 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the -	 * failure path. Check for failures that can occur asynchronously - -	 * for example, when CPUs are on-lined - and shut down the hardware -	 * perf event on each CPU accordingly. -	 * -	 * The only non-obvious place this bit can be cleared is through -	 * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a -	 * pr_info here would be too noisy as it would result in a message -	 * every few seconds if the hardlockup was disabled but the softlockup -	 * enabled. 
-	 */ -	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) -		watchdog_nmi_disable(cpu);  }  static struct smp_hotplug_thread watchdog_threads = { @@ -527,295 +527,174 @@ static struct smp_hotplug_thread watchdog_threads = {  	.unpark			= watchdog_enable,  }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) +static void softlockup_update_smpboot_threads(void)  { -	int cpu, ret = 0; +	lockdep_assert_held(&watchdog_mutex); -	atomic_set(&watchdog_park_in_progress, 1); +	if (!softlockup_threads_initialized) +		return; -	for_each_watchdog_cpu(cpu) { -		ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); -		if (ret) -			break; -	} - -	atomic_set(&watchdog_park_in_progress, 0); - -	return ret; +	smpboot_update_cpumask_percpu_thread(&watchdog_threads, +					     &watchdog_allowed_mask);  } -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void)  { -	int cpu; - -	for_each_watchdog_cpu(cpu) -		kthread_unpark(per_cpu(softlockup_watchdog, cpu)); +	cpumask_clear(&watchdog_allowed_mask); +	softlockup_update_smpboot_threads();  } -static int update_watchdog_all_cpus(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void)  { -	int ret; - -	ret = watchdog_park_threads(); -	if (ret) -		return ret; - -	watchdog_unpark_threads(); - -	return 0; +	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); +	softlockup_update_smpboot_threads();  } -static int watchdog_enable_all_cpus(void) +static void lockup_detector_reconfigure(void)  { -	int err = 0; - -	if (!watchdog_running) { -		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, -							     &watchdog_cpumask); -		if (err) -			pr_err("Failed to create watchdog threads, disabled\n"); -		else -			watchdog_running = 1; -	} else { -		/* -		 * Enable/disable the lockup detectors or -		 * change the sample period 'on the fly'. -		 */ -		err = update_watchdog_all_cpus(); - -		if (err) { -			watchdog_disable_all_cpus(); -			pr_err("Failed to update lockup detectors, disabled\n"); -		} -	} - -	if (err) -		watchdog_enabled = 0; - -	return err; +	cpus_read_lock(); +	watchdog_nmi_stop(); +	softlockup_park_all_threads(); +	set_sample_period(); +	lockup_detector_update_enable(); +	if (watchdog_enabled && watchdog_thresh) +		softlockup_unpark_threads(); +	watchdog_nmi_start(); +	cpus_read_unlock(); +	/* +	 * Must be called outside the cpus locked section to prevent +	 * recursive locking in the perf code. +	 */ +	__lockup_detector_cleanup();  } -static void watchdog_disable_all_cpus(void) +/* + * Create the watchdog thread infrastructure and configure the detector(s). + * + * The threads are not unparked as watchdog_allowed_mask is empty.  
When + * the threads are successfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void lockup_detector_setup(void)  { -	if (watchdog_running) { -		watchdog_running = 0; -		smpboot_unregister_percpu_thread(&watchdog_threads); -	} -} +	int ret; -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ -	return smpboot_update_cpumask_percpu_thread( -		    &watchdog_threads, &watchdog_cpumask); -} -#endif +	/* +	 * If sysctl is off and watchdog got disabled on the command line, +	 * nothing to do here. +	 */ +	lockup_detector_update_enable(); -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ -	return 0; -} +	if (!IS_ENABLED(CONFIG_SYSCTL) && +	    !(watchdog_enabled && watchdog_thresh)) +		return; -static void watchdog_unpark_threads(void) -{ -} +	ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, +						     &watchdog_allowed_mask); +	if (ret) { +		pr_err("Failed to initialize soft lockup detector threads\n"); +		return; +	} -static int watchdog_enable_all_cpus(void) -{ -	return 0; +	mutex_lock(&watchdog_mutex); +	softlockup_threads_initialized = true; +	lockup_detector_reconfigure(); +	mutex_unlock(&watchdog_mutex);  } -static void watchdog_disable_all_cpus(void) +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static void lockup_detector_reconfigure(void)  { +	cpus_read_lock(); +	watchdog_nmi_stop(); +	lockup_detector_update_enable(); +	watchdog_nmi_start(); +	cpus_read_unlock();  } - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) +static inline void lockup_detector_setup(void)  { -	return 0; +	lockup_detector_reconfigure();  } -#endif +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void set_sample_period(void) +static void __lockup_detector_cleanup(void)  { +	lockdep_assert_held(&watchdog_mutex); +	hardlockup_detector_perf_cleanup();  } -#endif /* SOFTLOCKUP */ -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem.   */ -int lockup_detector_suspend(void) +void lockup_detector_cleanup(void)  { -	int ret = 0; - -	get_online_cpus(); -	mutex_lock(&watchdog_proc_mutex); -	/* -	 * Multiple suspend requests can be active in parallel (counted by -	 * the 'watchdog_suspended' variable). If the watchdog threads are -	 * running, the first caller takes care that they will be parked. -	 * The state of 'watchdog_running' cannot change while a suspend -	 * request is active (see related code in 'proc' handlers). -	 */ -	if (watchdog_running && !watchdog_suspended) -		ret = watchdog_park_threads(); - -	if (ret == 0) -		watchdog_suspended++; -	else { -		watchdog_disable_all_cpus(); -		pr_err("Failed to suspend lockup detectors, disabled\n"); -		watchdog_enabled = 0; -	} - -	watchdog_nmi_reconfigure(); - -	mutex_unlock(&watchdog_proc_mutex); - -	return ret; +	mutex_lock(&watchdog_mutex); +	__lockup_detector_cleanup(); +	mutex_unlock(&watchdog_mutex);  } -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. 
It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever.   */ -void lockup_detector_resume(void) +void lockup_detector_soft_poweroff(void)  { -	mutex_lock(&watchdog_proc_mutex); - -	watchdog_suspended--; -	/* -	 * The watchdog threads are unparked if they were previously running -	 * and if there is no more active suspend request. -	 */ -	if (watchdog_running && !watchdog_suspended) -		watchdog_unpark_threads(); - -	watchdog_nmi_reconfigure(); - -	mutex_unlock(&watchdog_proc_mutex); -	put_online_cpus(); +	watchdog_enabled = 0;  }  #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ -static int proc_watchdog_update(void) +/* Propagate any changes to the watchdog threads */ +static void proc_watchdog_update(void)  { -	int err = 0; - -	/* -	 * Watchdog threads won't be started if they are already active. -	 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes -	 * care of this. If those threads are already active, the sample -	 * period will be updated and the lockup detectors will be enabled -	 * or disabled 'on the fly'. -	 */ -	if (watchdog_enabled && watchdog_thresh) -		err = watchdog_enable_all_cpus(); -	else -		watchdog_disable_all_cpus(); - -	watchdog_nmi_reconfigure(); - -	return err; - +	/* Remove impossible cpus to keep sysctl output clean. */ +	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); +	lockup_detector_reconfigure();  }  /*   * common function for watchdog, nmi_watchdog and soft_watchdog parameter   * - * caller             | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - *                    |                       | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller             | table->data points to      | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED | + *                    |                            | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED   */  static int proc_watchdog_common(int which, struct ctl_table *table, int write,  				void __user *buffer, size_t *lenp, loff_t *ppos)  { -	int err, old, new; -	int *watchdog_param = (int *)table->data; +	int err, old, *param = table->data; -	get_online_cpus(); -	mutex_lock(&watchdog_proc_mutex); +	mutex_lock(&watchdog_mutex); -	if (watchdog_suspended) { -		/* no parameter changes allowed while watchdog is suspended */ -		err = -EAGAIN; -		goto out; -	} - -	/* -	 * If the parameter is being read return the state of the corresponding -	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the -	 * run state of the lockup detectors. -	 */  	if (!write) { -		*watchdog_param = (watchdog_enabled & which) != 0; +		/* +		 * On read synchronize the userspace interface. 
This is a +		 * racy snapshot. +		 */ +		*param = (watchdog_enabled & which) != 0;  		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);  	} else { +		old = READ_ONCE(*param);  		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -		if (err) -			goto out; - -		/* -		 * There is a race window between fetching the current value -		 * from 'watchdog_enabled' and storing the new value. During -		 * this race window, watchdog_nmi_enable() can sneak in and -		 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. -		 * The 'cmpxchg' detects this race and the loop retries. -		 */ -		do { -			old = watchdog_enabled; -			/* -			 * If the parameter value is not zero set the -			 * corresponding bit(s), else clear it(them). -			 */ -			if (*watchdog_param) -				new = old | which; -			else -				new = old & ~which; -		} while (cmpxchg(&watchdog_enabled, old, new) != old); - -		/* -		 * Update the run state of the lockup detectors. There is _no_ -		 * need to check the value returned by proc_watchdog_update() -		 * and to restore the previous value of 'watchdog_enabled' as -		 * both lockup detectors are disabled if proc_watchdog_update() -		 * returns an error. -		 */ -		if (old == new) -			goto out; - -		err = proc_watchdog_update(); +		if (!err && old != READ_ONCE(*param)) +			proc_watchdog_update();  	} -out: -	mutex_unlock(&watchdog_proc_mutex); -	put_online_cpus(); +	mutex_unlock(&watchdog_mutex);  	return err;  } @@ -835,6 +714,8 @@ int proc_watchdog(struct ctl_table *table, int write,  int proc_nmi_watchdog(struct ctl_table *table, int write,  		      void __user *buffer, size_t *lenp, loff_t *ppos)  { +	if (!nmi_watchdog_available && write) +		return -ENOTSUPP;  	return proc_watchdog_common(NMI_WATCHDOG_ENABLED,  				    table, write, buffer, lenp, ppos);  } @@ -855,39 +736,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,  int proc_watchdog_thresh(struct ctl_table *table, int write,  			 void __user *buffer, size_t *lenp, loff_t *ppos)  { -	int err, old, new; - -	get_online_cpus(); -	mutex_lock(&watchdog_proc_mutex); +	int err, old; -	if (watchdog_suspended) { -		/* no parameter changes allowed while watchdog is suspended */ -		err = -EAGAIN; -		goto out; -	} +	mutex_lock(&watchdog_mutex); -	old = ACCESS_ONCE(watchdog_thresh); +	old = READ_ONCE(watchdog_thresh);  	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -	if (err || !write) -		goto out; - -	/* -	 * Update the sample period. Restore on failure. -	 */ -	new = ACCESS_ONCE(watchdog_thresh); -	if (old == new) -		goto out; +	if (!err && write && old != READ_ONCE(watchdog_thresh)) +		proc_watchdog_update(); -	set_sample_period(); -	err = proc_watchdog_update(); -	if (err) { -		watchdog_thresh = old; -		set_sample_period(); -	} -out: -	mutex_unlock(&watchdog_proc_mutex); -	put_online_cpus(); +	mutex_unlock(&watchdog_mutex);  	return err;  } @@ -902,45 +761,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,  {  	int err; -	get_online_cpus(); -	mutex_lock(&watchdog_proc_mutex); - -	if (watchdog_suspended) { -		/* no parameter changes allowed while watchdog is suspended */ -		err = -EAGAIN; -		goto out; -	} +	mutex_lock(&watchdog_mutex);  	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); -	if (!err && write) { -		/* Remove impossible cpus to keep sysctl output cleaner. 
*/ -		cpumask_and(&watchdog_cpumask, &watchdog_cpumask, -			    cpu_possible_mask); - -		if (watchdog_running) { -			/* -			 * Failure would be due to being unable to allocate -			 * a temporary cpumask, so we are likely not in a -			 * position to do much else to make things better. -			 */ -			if (watchdog_update_cpus() != 0) -				pr_err("cpumask update failed\n"); -		} +	if (!err && write) +		proc_watchdog_update(); -		watchdog_nmi_reconfigure(); -	} -out: -	mutex_unlock(&watchdog_proc_mutex); -	put_online_cpus(); +	mutex_unlock(&watchdog_mutex);  	return err;  } -  #endif /* CONFIG_SYSCTL */  void __init lockup_detector_init(void)  { -	set_sample_period(); -  #ifdef CONFIG_NO_HZ_FULL  	if (tick_nohz_full_enabled()) {  		pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -951,6 +784,7 @@ void __init lockup_detector_init(void)  	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);  #endif -	if (watchdog_enabled) -		watchdog_enable_all_cpus(); +	if (!watchdog_nmi_probe()) +		nmi_watchdog_available = true; +	lockup_detector_setup();  } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..e449a23e9d59 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Detect hard lockups on a system   * @@ -12,6 +13,7 @@  #define pr_fmt(fmt) "NMI watchdog: " fmt  #include <linux/nmi.h> +#include <linux/atomic.h>  #include <linux/module.h>  #include <linux/sched/debug.h> @@ -21,8 +23,11 @@  static DEFINE_PER_CPU(bool, hard_watchdog_warn);  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); +static struct cpumask dead_events_mask;  static unsigned long hardlockup_allcpu_dumped; +static atomic_t watchdog_cpus = ATOMIC_INIT(0);  void arch_touch_nmi_watchdog(void)  { @@ -103,15 +108,12 @@ static struct perf_event_attr wd_hw_attr = {  /* Callback function for perf event subsystem */  static void watchdog_overflow_callback(struct perf_event *event, -		 struct perf_sample_data *data, -		 struct pt_regs *regs) +				       struct perf_sample_data *data, +				       struct pt_regs *regs)  {  	/* Ensure the watchdog never gets throttled */  	event->hw.interrupts = 0; -	if (atomic_read(&watchdog_park_in_progress) != 0) -		return; -  	if (__this_cpu_read(watchdog_nmi_touch) == true) {  		__this_cpu_write(watchdog_nmi_touch, false);  		return; @@ -160,104 +162,134 @@ static void watchdog_overflow_callback(struct perf_event *event,  	return;  } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) +static int hardlockup_detector_event_create(void)  { +	unsigned int cpu = smp_processor_id();  	struct perf_event_attr *wd_attr; -	struct perf_event *event = per_cpu(watchdog_ev, cpu); -	int firstcpu = 0; - -	/* nothing to do if the hard lockup detector is disabled */ -	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) -		goto out; - -	/* is it already setup and enabled? 
*/ -	if (event && event->state > PERF_EVENT_STATE_OFF) -		goto out; - -	/* it is setup but not enabled */ -	if (event != NULL) -		goto out_enable; - -	if (atomic_inc_return(&watchdog_cpus) == 1) -		firstcpu = 1; +	struct perf_event *evt;  	wd_attr = &wd_hw_attr;  	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);  	/* Try to register using hardware perf events */ -	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); +	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, +					       watchdog_overflow_callback, NULL); +	if (IS_ERR(evt)) { +		pr_info("Perf event create on CPU %d failed with %ld\n", cpu, +			PTR_ERR(evt)); +		return PTR_ERR(evt); +	} +	this_cpu_write(watchdog_ev, evt); +	return 0; +} -	/* save the first cpu's error for future comparision */ -	if (firstcpu && IS_ERR(event)) -		firstcpu_err = PTR_ERR(event); +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ +	if (hardlockup_detector_event_create()) +		return; -	if (!IS_ERR(event)) { -		/* only print for the first cpu initialized */ -		if (firstcpu || firstcpu_err) -			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); -		goto out_save; -	} +	/* use original value for check */ +	if (!atomic_fetch_inc(&watchdog_cpus)) +		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); -	/* -	 * Disable the hard lockup detector if _any_ CPU fails to set up -	 * set up the hardware perf event. The watchdog() function checks -	 * the NMI_WATCHDOG_ENABLED bit periodically. -	 * -	 * The barriers are for syncing up watchdog_enabled across all the -	 * cpus, as clear_bit() does not use barriers. -	 */ -	smp_mb__before_atomic(); -	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); -	smp_mb__after_atomic(); - -	/* skip displaying the same error again */ -	if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) -		return PTR_ERR(event); - -	/* vary the KERN level based on the returned errno */ -	if (PTR_ERR(event) == -EOPNOTSUPP) -		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); -	else if (PTR_ERR(event) == -ENOENT) -		pr_warn("disabled (cpu%i): hardware events not enabled\n", -			 cpu); -	else -		pr_err("disabled (cpu%i): unable to create perf event: %ld\n", -			cpu, PTR_ERR(event)); - -	pr_info("Shutting down hard lockup detector on all cpus\n"); - -	return PTR_ERR(event); - -	/* success path */ -out_save: -	per_cpu(watchdog_ev, cpu) = event; -out_enable: -	perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: -	return 0; +	perf_event_enable(this_cpu_read(watchdog_ev));  } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void)  { -	struct perf_event *event = per_cpu(watchdog_ev, cpu); +	struct perf_event *event = this_cpu_read(watchdog_ev);  	if (event) {  		perf_event_disable(event); -		per_cpu(watchdog_ev, cpu) = NULL; +		this_cpu_write(watchdog_ev, NULL); +		this_cpu_write(dead_event, event); +		cpumask_set_cpu(smp_processor_id(), &dead_events_mask); +		atomic_dec(&watchdog_cpus); +	} +} + +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. 
+ */ +void hardlockup_detector_perf_cleanup(void) +{ +	int cpu; + +	for_each_cpu(cpu, &dead_events_mask) { +		struct perf_event *event = per_cpu(dead_event, cpu); -		/* should be in cleanup, but blocks oprofile */ -		perf_event_release_kernel(event); +		/* +		 * Required because for_each_cpu() reports  unconditionally +		 * CPU0 as set on UP kernels. Sigh. +		 */ +		if (event) +			perf_event_release_kernel(event); +		per_cpu(dead_event, cpu) = NULL; +	} +	cpumask_clear(&dead_events_mask); +} + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ +	int cpu; + +	lockdep_assert_cpus_held(); + +	for_each_online_cpu(cpu) { +		struct perf_event *event = per_cpu(watchdog_ev, cpu); + +		if (event) +			perf_event_disable(event); +	} +} + +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ +	int cpu; + +	lockdep_assert_cpus_held(); + +	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) +		return; + +	for_each_online_cpu(cpu) { +		struct perf_event *event = per_cpu(watchdog_ev, cpu); + +		if (event) +			perf_event_enable(event); +	} +} + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ +	int ret = hardlockup_detector_event_create(); -		/* watchdog_nmi_enable() expects this to be zero initially. */ -		if (atomic_dec_and_test(&watchdog_cpus)) -			firstcpu_err = 0; +	if (ret) { +		pr_info("Perf NMI watchdog permanently disabled\n"); +	} else { +		perf_event_release_kernel(this_cpu_read(watchdog_ev)); +		this_cpu_write(watchdog_ev, NULL);  	} +	return ret;  } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a2dccfe1acec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,7 @@ enum {  	 * attach_mutex to avoid changing binding state while  	 * worker_attach_to_pool() is in progress.  	 */ +	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */  	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */  	/* worker flags */ @@ -165,7 +166,6 @@ struct worker_pool {  						/* L: hash of busy workers */  	/* see manage_workers() for details on the two manager mutexes */ -	struct mutex		manager_arb;	/* manager arbitration */  	struct worker		*manager;	/* L: purely informational */  	struct mutex		attach_mutex;	/* attach/detach exclusion */  	struct list_head	workers;	/* A: attached workers */ @@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;  static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */  static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */  static bool workqueue_freezing;		/* PL: have wqs started freezing? */ @@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool)  /* Do we have too many workers and should some go away? 
*/  static bool too_many_workers(struct worker_pool *pool)  { -	bool managing = mutex_is_locked(&pool->manager_arb); +	bool managing = pool->flags & POOL_MANAGER_ACTIVE;  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */  	int nr_busy = pool->nr_workers - nr_idle; @@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	/* -	 * Anyone who successfully grabs manager_arb wins the arbitration -	 * and becomes the manager.  mutex_trylock() on pool->manager_arb -	 * failure while holding pool->lock reliably indicates that someone -	 * else is managing the pool and the worker which failed trylock -	 * can proceed to executing work items.  This means that anyone -	 * grabbing manager_arb is responsible for actually performing -	 * manager duties.  If manager_arb is grabbed and released without -	 * actual management, the pool may stall indefinitely. -	 */ -	if (!mutex_trylock(&pool->manager_arb)) +	if (pool->flags & POOL_MANAGER_ACTIVE)  		return false; + +	pool->flags |= POOL_MANAGER_ACTIVE;  	pool->manager = worker;  	maybe_create_worker(pool);  	pool->manager = NULL; -	mutex_unlock(&pool->manager_arb); +	pool->flags &= ~POOL_MANAGER_ACTIVE; +	wake_up(&wq_manager_wait);  	return true;  } @@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool)  	setup_timer(&pool->mayday_timer, pool_mayday_timeout,  		    (unsigned long)pool); -	mutex_init(&pool->manager_arb);  	mutex_init(&pool->attach_mutex);  	INIT_LIST_HEAD(&pool->workers); @@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool)  	hash_del(&pool->hash_node);  	/* -	 * Become the manager and destroy all workers.  Grabbing -	 * manager_arb prevents @pool's workers from blocking on -	 * attach_mutex. +	 * Become the manager and destroy all workers.  This prevents +	 * @pool's workers from blocking on attach_mutex.  We're the last +	 * manager and @pool gets freed with the flag set.  	 */ -	mutex_lock(&pool->manager_arb); -  	spin_lock_irq(&pool->lock); +	wait_event_lock_irq(wq_manager_wait, +			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); +	pool->flags |= POOL_MANAGER_ACTIVE; +  	while ((worker = first_idle_worker(pool)))  		destroy_worker(worker);  	WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool)  	if (pool->detach_completion)  		wait_for_completion(pool->detach_completion); -	mutex_unlock(&pool->manager_arb); -  	/* shut down the timers */  	del_timer_sync(&pool->idle_timer);  	del_timer_sync(&pool->mayday_timer); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417c587b..efdd72e15794 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * kernel/workqueue_internal.h   * |
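The watchdog sysctl handlers in the hunks above all collapse to roughly the same shape: serialize on watchdog_mutex, snapshot the old value, let the generic proc helper parse the write, and call proc_watchdog_update() only when a write actually changed something. A minimal sketch of that shape, assuming the surrounding kernel/watchdog.c context; the helper name proc_watchdog_param is illustrative and not part of the patch:

/*
 * Illustrative consolidation of the pattern used by proc_watchdog_thresh()
 * and the other handlers above; not actual patch content.
 */
static int proc_watchdog_param(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos,
			       int *param)
{
	int err, old;

	mutex_lock(&watchdog_mutex);
	old = READ_ONCE(*param);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!err && write && old != READ_ONCE(*param))
		proc_watchdog_update();
	mutex_unlock(&watchdog_mutex);

	return err;
}

Note that proc_watchdog_common() still special-cases reads, where it only snapshots the requested bit out of watchdog_enabled; its write path is the part that follows this pattern.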
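The kernel/watchdog_hld.c rework replaces the old watchdog_nmi_enable()/watchdog_nmi_disable() per-CPU paths with a small local-CPU interface: hardlockup_detector_perf_enable() creates and enables the counter on the current CPU, hardlockup_detector_perf_disable() parks the disabled event in dead_event and marks the CPU in dead_events_mask, and the actual perf_event_release_kernel() is deferred to hardlockup_detector_perf_cleanup(). A usage sketch with hypothetical caller names; the real wiring lives in kernel/watchdog.c, outside these hunks:

/* Hypothetical per-CPU callbacks showing the intended call order only. */
static void example_watchdog_cpu_online(void)
{
	/* Creates the perf counter on this CPU and enables it. */
	hardlockup_detector_perf_enable();
}

static void example_watchdog_cpu_offline(void)
{
	/*
	 * Disables the local event and parks it; the event is released
	 * later by hardlockup_detector_perf_cleanup(), which the patch
	 * documents as being called from lockup_detector_cleanup().
	 */
	hardlockup_detector_perf_disable();
}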
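In kernel/workqueue.c the per-pool manager_arb mutex is replaced by the POOL_MANAGER_ACTIVE flag, manipulated only under pool->lock, plus the wq_manager_wait waitqueue so that put_unbound_pool() can wait out an in-flight manager. A condensed sketch of both sides of that handshake, pulled together from the hunks above; the example_ names are illustrative and the pool->manager bookkeeping is elided:

/* Claim side, as in manage_workers(); called with pool->lock held. */
static bool example_manage_workers(struct worker_pool *pool)
{
	if (pool->flags & POOL_MANAGER_ACTIVE)
		return false;		/* another worker is already managing */

	pool->flags |= POOL_MANAGER_ACTIVE;
	maybe_create_worker(pool);
	pool->flags &= ~POOL_MANAGER_ACTIVE;
	wake_up(&wq_manager_wait);
	return true;
}

/* Destroy side, as in put_unbound_pool(); also entered with pool->lock held. */
static void example_put_unbound_pool_claim(struct worker_pool *pool)
{
	wait_event_lock_irq(wq_manager_wait,
			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
	/* The pool is torn down and freed with the flag still set. */
	pool->flags |= POOL_MANAGER_ACTIVE;
}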