Diffstat (limited to 'kernel')
115 files changed, 5567 insertions(+), 3928 deletions(-)
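Most of the audit changes in this range collapse the old audit_filter_user() and audit_filter_type() walks into a single audit_filter(msgtype, listtype) helper, and its two call sites read the result differently: audit_receive_msg() treats a return of 1 as "go ahead and audit this user message", while audit_log_start() drops the record when the exclude (AUDIT_FILTER_TYPE) list returns 0. The following is only a userspace sketch of that calling convention; audit_filter() is stubbed out, and the list identifiers and message types are placeholders rather than the kernel's actual values.

/* Illustrative stub of the consolidated audit_filter() calling convention
 * visible in the kernel/audit.c and kernel/auditfilter.c hunks below.
 * Only the return-value handling mirrors the diff; everything else is a
 * placeholder. */
#include <stdio.h>

#define AUDIT_FILTER_USER 0	/* placeholder list identifiers */
#define AUDIT_FILTER_TYPE 5

/* stand-in for the kernel's audit_filter(): 1 means "audit by default",
 * 0 means a matching rule (or the exclude list) suppressed the event */
static int audit_filter(int msgtype, unsigned int listtype)
{
	(void)msgtype;
	(void)listtype;
	return 1;
}

static void receive_user_msg(int msg_type)
{
	/* mirrors audit_receive_msg(): a return of 1 (match or error)
	 * means the user message is accepted for auditing */
	if (audit_filter(msg_type, AUDIT_FILTER_USER) == 1)
		printf("auditing user message type %d\n", msg_type);
}

static int log_start_allowed(int type)
{
	/* mirrors audit_log_start(): a 0 result from the TYPE (exclude)
	 * list means the record is dropped before it is even built */
	return audit_filter(type, AUDIT_FILTER_TYPE) != 0;
}

int main(void)
{
	receive_user_msg(1112);		/* arbitrary message type */
	printf("would log type 1300: %d\n", log_start_allowed(1300));
	return 0;
}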
diff --git a/kernel/audit.c b/kernel/audit.c index 22bb4f24f071..a8a91bd2b2a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -932,7 +932,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		if (!audit_enabled && msg_type != AUDIT_USER_AVC)  			return 0; -		err = audit_filter_user(msg_type); +		err = audit_filter(msg_type, AUDIT_FILTER_USER);  		if (err == 1) { /* match or error */  			err = 0;  			if (msg_type == AUDIT_USER_TTY) { @@ -1379,7 +1379,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,  	if (audit_initialized != AUDIT_INITIALIZED)  		return NULL; -	if (unlikely(audit_filter_type(type))) +	if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))  		return NULL;  	if (gfp_mask & __GFP_DIRECT_RECLAIM) { @@ -1883,6 +1883,23 @@ out_null:  	audit_log_format(ab, " exe=(null)");  } +struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ +	struct tty_struct *tty = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&tsk->sighand->siglock, flags); +	if (tsk->signal) +		tty = tty_kref_get(tsk->signal->tty); +	spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +	return tty; +} + +void audit_put_tty(struct tty_struct *tty) +{ +	tty_kref_put(tty); +} +  void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)  {  	const struct cred *cred; diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..431444c3708b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -23,6 +23,7 @@  #include <linux/audit.h>  #include <linux/skbuff.h>  #include <uapi/linux/mqueue.h> +#include <linux/tty.h>  /* AUDIT_NAMES is the number of slots we reserve in the audit_context   * for saving names from getname().  If we get more names we will allocate @@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);  extern void audit_log_d_path_exe(struct audit_buffer *ab,  				 struct mm_struct *mm); +extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern void audit_put_tty(struct tty_struct *tty); +  /* audit watch functions */  #ifdef CONFIG_AUDIT_WATCH  extern void audit_put_watch(struct audit_watch *watch); @@ -327,6 +331,8 @@ extern pid_t audit_sig_pid;  extern kuid_t audit_sig_uid;  extern u32 audit_sig_sid; +extern int audit_filter(int msgtype, unsigned int listtype); +  #ifdef CONFIG_AUDITSYSCALL  extern int __audit_signal_info(int sig, struct task_struct *t);  static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 94ca7b1e5e7e..85d9cac497e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)  	return strncmp(p, dname, dlen);  } -static int audit_filter_user_rules(struct audit_krule *rule, int type, -				   enum audit_state *state) +int audit_filter(int msgtype, unsigned int listtype)  { -	int i; - -	for (i = 0; i < rule->field_count; i++) { -		struct audit_field *f = &rule->fields[i]; -		pid_t pid; -		int result = 0; -		u32 sid; - -		switch (f->type) { -		case AUDIT_PID: -			pid = task_pid_nr(current); -			result = audit_comparator(pid, f->op, f->val); -			break; -		case AUDIT_UID: -			result = audit_uid_comparator(current_uid(), f->op, f->uid); -			break; -		case AUDIT_GID: -			result = audit_gid_comparator(current_gid(), f->op, f->gid); -			break; -		case AUDIT_LOGINUID: -			result = audit_uid_comparator(audit_get_loginuid(current), -						  f->op, f->uid); -			break; -		case 
AUDIT_LOGINUID_SET: -			result = audit_comparator(audit_loginuid_set(current), -						  f->op, f->val); -			break; -		case AUDIT_MSGTYPE: -			result = audit_comparator(type, f->op, f->val); -			break; -		case AUDIT_SUBJ_USER: -		case AUDIT_SUBJ_ROLE: -		case AUDIT_SUBJ_TYPE: -		case AUDIT_SUBJ_SEN: -		case AUDIT_SUBJ_CLR: -			if (f->lsm_rule) { -				security_task_getsecid(current, &sid); -				result = security_audit_rule_match(sid, -								   f->type, -								   f->op, -								   f->lsm_rule, -								   NULL); -			} -			break; -		} - -		if (!result) -			return 0; -	} -	switch (rule->action) { -	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break; -	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break; -	} -	return 1; -} - -int audit_filter_user(int type) -{ -	enum audit_state state = AUDIT_DISABLED;  	struct audit_entry *e; -	int rc, ret; - -	ret = 1; /* Audit by default */ - -	rcu_read_lock(); -	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { -		rc = audit_filter_user_rules(&e->rule, type, &state); -		if (rc) { -			if (rc > 0 && state == AUDIT_DISABLED) -				ret = 0; -			break; -		} -	} -	rcu_read_unlock(); - -	return ret; -} - -int audit_filter_type(int type) -{ -	struct audit_entry *e; -	int result = 0; +	int ret = 1; /* Audit by default */  	rcu_read_lock(); -	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) +	if (list_empty(&audit_filter_list[listtype]))  		goto unlock_and_return; +	list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { +		int i, result = 0; -	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], -				list) { -		int i;  		for (i = 0; i < e->rule.field_count; i++) {  			struct audit_field *f = &e->rule.fields[i]; -			if (f->type == AUDIT_MSGTYPE) { -				result = audit_comparator(type, f->op, f->val); -				if (!result) -					break; +			pid_t pid; +			u32 sid; + +			switch (f->type) { +			case AUDIT_PID: +				pid = task_pid_nr(current); +				result = audit_comparator(pid, f->op, f->val); +				break; +			case AUDIT_UID: +				result = audit_uid_comparator(current_uid(), f->op, f->uid); +				break; +			case AUDIT_GID: +				result = audit_gid_comparator(current_gid(), f->op, f->gid); +				break; +			case AUDIT_LOGINUID: +				result = audit_uid_comparator(audit_get_loginuid(current), +							      f->op, f->uid); +				break; +			case AUDIT_LOGINUID_SET: +				result = audit_comparator(audit_loginuid_set(current), +							  f->op, f->val); +				break; +			case AUDIT_MSGTYPE: +				result = audit_comparator(msgtype, f->op, f->val); +				break; +			case AUDIT_SUBJ_USER: +			case AUDIT_SUBJ_ROLE: +			case AUDIT_SUBJ_TYPE: +			case AUDIT_SUBJ_SEN: +			case AUDIT_SUBJ_CLR: +				if (f->lsm_rule) { +					security_task_getsecid(current, &sid); +					result = security_audit_rule_match(sid, +							f->type, f->op, f->lsm_rule, NULL); +				} +				break; +			default: +				goto unlock_and_return;  			} +			if (result < 0) /* error */ +				goto unlock_and_return; +			if (!result) +				break; +		} +		if (result > 0) { +			if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) +				ret = 0; +			break;  		} -		if (result) -			goto unlock_and_return;  	}  unlock_and_return:  	rcu_read_unlock(); -	return result; +	return ret;  }  static int update_lsm_rule(struct audit_krule *r) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 62ab53d7619c..5abf1dc1f91c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -63,7 +63,6 @@  #include <asm/unistd.h>  #include <linux/security.h>  #include <linux/list.h> -#include 
<linux/tty.h>  #include <linux/binfmts.h>  #include <linux/highmem.h>  #include <linux/syscalls.h> @@ -73,6 +72,7 @@  #include <linux/compat.h>  #include <linux/ctype.h>  #include <linux/string.h> +#include <linux/uaccess.h>  #include <uapi/linux/limits.h>  #include "audit.h" @@ -82,7 +82,8 @@  #define AUDITSC_SUCCESS 1  #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */  #define MAX_EXECVE_AUDIT_LEN 7500  /* max length to print of cmdline/proctitle value during audit */ @@ -695,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk,  		ctx->prio = rule->prio;  	}  	switch (rule->action) { -	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break; -	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break; +	case AUDIT_NEVER: +		*state = AUDIT_DISABLED; +		break; +	case AUDIT_ALWAYS: +		*state = AUDIT_RECORD_CONTEXT; +		break;  	}  	return 1;  } @@ -988,184 +993,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,  	return rc;  } -/* - * to_send and len_sent accounting are very loose estimates.  We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf?  an int is up to 12 digits long.  if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message.  In one 7500 byte message we can log up to - * about 1000 min size arguments.  That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, -					struct audit_buffer **ab, -					int arg_num, -					size_t *len_sent, -					const char __user *p, -					char *buf) +static void audit_log_execve_info(struct audit_context *context, +				  struct audit_buffer **ab)  { -	char arg_num_len_buf[12]; -	const char __user *tmp_p = p; -	/* how many digits are in arg_num? 5 is the length of ' a=""' */ -	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; -	size_t len, len_left, to_send; -	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; -	unsigned int i, has_cntl = 0, too_long = 0; -	int ret; - -	/* strnlen_user includes the null we don't want to send */ -	len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - -	/* -	 * We just created this mm, if we can't find the strings -	 * we just copied into it something is _very_ wrong. Similar -	 * for strings that are too long, we should not have created -	 * any. -	 */ -	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { -		send_sig(SIGKILL, current, 0); -		return -1; +	long len_max; +	long len_rem; +	long len_full; +	long len_buf; +	long len_abuf; +	long len_tmp; +	bool require_data; +	bool encode; +	unsigned int iter; +	unsigned int arg; +	char *buf_head; +	char *buf; +	const char __user *p = (const char __user *)current->mm->arg_start; + +	/* NOTE: this buffer needs to be large enough to hold all the non-arg +	 *       data we put in the audit record for this argument (see the +	 *       code below) ... 
at this point in time 96 is plenty */ +	char abuf[96]; + +	/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the +	 *       current value of 7500 is not as important as the fact that it +	 *       is less than 8k, a setting of 7500 gives us plenty of wiggle +	 *       room if we go over a little bit in the logging below */ +	WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); +	len_max = MAX_EXECVE_AUDIT_LEN; + +	/* scratch buffer to hold the userspace args */ +	buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); +	if (!buf_head) { +		audit_panic("out of memory for argv string"); +		return;  	} +	buf = buf_head; + +	audit_log_format(*ab, "argc=%d", context->execve.argc); -	/* walk the whole argument looking for non-ascii chars */ +	len_rem = len_max; +	len_buf = 0; +	len_full = 0; +	require_data = true; +	encode = false; +	iter = 0; +	arg = 0;  	do { -		if (len_left > MAX_EXECVE_AUDIT_LEN) -			to_send = MAX_EXECVE_AUDIT_LEN; -		else -			to_send = len_left; -		ret = copy_from_user(buf, tmp_p, to_send); -		/* -		 * There is no reason for this copy to be short. We just -		 * copied them here, and the mm hasn't been exposed to user- -		 * space yet. -		 */ -		if (ret) { -			WARN_ON(1); -			send_sig(SIGKILL, current, 0); -			return -1; -		} -		buf[to_send] = '\0'; -		has_cntl = audit_string_contains_control(buf, to_send); -		if (has_cntl) { -			/* -			 * hex messages get logged as 2 bytes, so we can only -			 * send half as much in each message -			 */ -			max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; -			break; -		} -		len_left -= to_send; -		tmp_p += to_send; -	} while (len_left > 0); - -	len_left = len; - -	if (len > max_execve_audit_len) -		too_long = 1; - -	/* rewalk the argument actually logging the message */ -	for (i = 0; len_left > 0; i++) { -		int room_left; - -		if (len_left > max_execve_audit_len) -			to_send = max_execve_audit_len; -		else -			to_send = len_left; - -		/* do we have space left to send this argument in this ab? */ -		room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; -		if (has_cntl) -			room_left -= (to_send * 2); -		else -			room_left -= to_send; -		if (room_left < 0) { -			*len_sent = 0; -			audit_log_end(*ab); -			*ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); -			if (!*ab) -				return 0; -		} +		/* NOTE: we don't ever want to trust this value for anything +		 *       serious, but the audit record format insists we +		 *       provide an argument length for really long arguments, +		 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but +		 *       to use strncpy_from_user() to obtain this value for +		 *       recording in the log, although we don't use it +		 *       anywhere here to avoid a double-fetch problem */ +		if (len_full == 0) +			len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + +		/* read more data from userspace */ +		if (require_data) { +			/* can we make more room in the buffer? 
*/ +			if (buf != buf_head) { +				memmove(buf_head, buf, len_buf); +				buf = buf_head; +			} + +			/* fetch as much as we can of the argument */ +			len_tmp = strncpy_from_user(&buf_head[len_buf], p, +						    len_max - len_buf); +			if (len_tmp == -EFAULT) { +				/* unable to copy from userspace */ +				send_sig(SIGKILL, current, 0); +				goto out; +			} else if (len_tmp == (len_max - len_buf)) { +				/* buffer is not large enough */ +				require_data = true; +				/* NOTE: if we are going to span multiple +				 *       buffers force the encoding so we stand +				 *       a chance at a sane len_full value and +				 *       consistent record encoding */ +				encode = true; +				len_full = len_full * 2; +				p += len_tmp; +			} else { +				require_data = false; +				if (!encode) +					encode = audit_string_contains_control( +								buf, len_tmp); +				/* try to use a trusted value for len_full */ +				if (len_full < len_max) +					len_full = (encode ? +						    len_tmp * 2 : len_tmp); +				p += len_tmp + 1; +			} +			len_buf += len_tmp; +			buf_head[len_buf] = '\0'; -		/* -		 * first record needs to say how long the original string was -		 * so we can be sure nothing was lost. -		 */ -		if ((i == 0) && (too_long)) -			audit_log_format(*ab, " a%d_len=%zu", arg_num, -					 has_cntl ? 2*len : len); - -		/* -		 * normally arguments are small enough to fit and we already -		 * filled buf above when we checked for control characters -		 * so don't bother with another copy_from_user -		 */ -		if (len >= max_execve_audit_len) -			ret = copy_from_user(buf, p, to_send); -		else -			ret = 0; -		if (ret) { -			WARN_ON(1); -			send_sig(SIGKILL, current, 0); -			return -1; +			/* length of the buffer in the audit record? */ +			len_abuf = (encode ? len_buf * 2 : len_buf + 2);  		} -		buf[to_send] = '\0'; - -		/* actually log it */ -		audit_log_format(*ab, " a%d", arg_num); -		if (too_long) -			audit_log_format(*ab, "[%d]", i); -		audit_log_format(*ab, "="); -		if (has_cntl) -			audit_log_n_hex(*ab, buf, to_send); -		else -			audit_log_string(*ab, buf); - -		p += to_send; -		len_left -= to_send; -		*len_sent += arg_num_len; -		if (has_cntl) -			*len_sent += to_send * 2; -		else -			*len_sent += to_send; -	} -	/* include the null we didn't log */ -	return len + 1; -} -static void audit_log_execve_info(struct audit_context *context, -				  struct audit_buffer **ab) -{ -	int i, len; -	size_t len_sent = 0; -	const char __user *p; -	char *buf; +		/* write as much as we can to the audit log */ +		if (len_buf > 0) { +			/* NOTE: some magic numbers here - basically if we +			 *       can't fit a reasonable amount of data into the +			 *       existing audit buffer, flush it and start with +			 *       a new buffer */ +			if ((sizeof(abuf) + 8) > len_rem) { +				len_rem = len_max; +				audit_log_end(*ab); +				*ab = audit_log_start(context, +						      GFP_KERNEL, AUDIT_EXECVE); +				if (!*ab) +					goto out; +			} -	p = (const char __user *)current->mm->arg_start; +			/* create the non-arg portion of the arg record */ +			len_tmp = 0; +			if (require_data || (iter > 0) || +			    ((len_abuf + sizeof(abuf)) > len_rem)) { +				if (iter == 0) { +					len_tmp += snprintf(&abuf[len_tmp], +							sizeof(abuf) - len_tmp, +							" a%d_len=%lu", +							arg, len_full); +				} +				len_tmp += snprintf(&abuf[len_tmp], +						    sizeof(abuf) - len_tmp, +						    " a%d[%d]=", arg, iter++); +			} else +				len_tmp += snprintf(&abuf[len_tmp], +						    sizeof(abuf) - len_tmp, +						    " a%d=", arg); +			
WARN_ON(len_tmp >= sizeof(abuf)); +			abuf[sizeof(abuf) - 1] = '\0'; + +			/* log the arg in the audit record */ +			audit_log_format(*ab, "%s", abuf); +			len_rem -= len_tmp; +			len_tmp = len_buf; +			if (encode) { +				if (len_abuf > len_rem) +					len_tmp = len_rem / 2; /* encoding */ +				audit_log_n_hex(*ab, buf, len_tmp); +				len_rem -= len_tmp * 2; +				len_abuf -= len_tmp * 2; +			} else { +				if (len_abuf > len_rem) +					len_tmp = len_rem - 2; /* quotes */ +				audit_log_n_string(*ab, buf, len_tmp); +				len_rem -= len_tmp + 2; +				/* don't subtract the "2" because we still need +				 * to add quotes to the remaining string */ +				len_abuf -= len_tmp; +			} +			len_buf -= len_tmp; +			buf += len_tmp; +		} -	audit_log_format(*ab, "argc=%d", context->execve.argc); +		/* ready to move to the next argument? */ +		if ((len_buf == 0) && !require_data) { +			arg++; +			iter = 0; +			len_full = 0; +			require_data = true; +			encode = false; +		} +	} while (arg < context->execve.argc); -	/* -	 * we need some kernel buffer to hold the userspace args.  Just -	 * allocate one big one rather than allocating one of the right size -	 * for every single argument inside audit_log_single_execve_arg() -	 * should be <8k allocation so should be pretty safe. -	 */ -	buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); -	if (!buf) { -		audit_panic("out of memory for argv string"); -		return; -	} +	/* NOTE: the caller handles the final audit_log_end() call */ -	for (i = 0; i < context->execve.argc; i++) { -		len = audit_log_single_execve_arg(context, ab, i, -						  &len_sent, p, buf); -		if (len <= 0) -			break; -		p += len; -	} -	kfree(buf); +out: +	kfree(buf_head);  }  static void show_special(struct audit_context *context, int *call_panic) @@ -1426,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts  	if (context->pwd.dentry && context->pwd.mnt) {  		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);  		if (ab) { -			audit_log_d_path(ab, " cwd=", &context->pwd); +			audit_log_d_path(ab, "cwd=", &context->pwd);  			audit_log_end(ab);  		}  	} @@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,  	if (!audit_enabled)  		return; +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); +	if (!ab) +		return; +  	uid = from_kuid(&init_user_ns, task_uid(current));  	oldloginuid = from_kuid(&init_user_ns, koldloginuid);  	loginuid = from_kuid(&init_user_ns, kloginuid),  	tty = audit_get_tty(current); -	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); -	if (!ab) -		return;  	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);  	audit_log_task_context(ab);  	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)  }  /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, -				    void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, +				 void *key, void *value, u64 map_flags)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,  		return -E2BIG;  	ufd = *(u32 *)value; -	new_ptr = map->ops->map_fd_get_ptr(map, 
ufd); +	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);  	if (IS_ERR(new_ptr))  		return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)  	}  } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, +				   struct file *map_file, int fd)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	struct bpf_prog *prog = bpf_prog_get(fd); +  	if (IS_ERR(prog))  		return prog; @@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)  		bpf_prog_put(prog);  		return ERR_PTR(-EINVAL);  	} +  	return prog;  }  static void prog_fd_array_put_ptr(void *ptr)  { -	struct bpf_prog *prog = ptr; - -	bpf_prog_put_rcu(prog); +	bpf_prog_put(ptr);  }  /* decrement refcnt of all bpf_progs that are stored in this map */ @@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {  	.map_free = fd_array_map_free,  	.map_get_next_key = array_map_get_next_key,  	.map_lookup_elem = fd_array_map_lookup_elem, -	.map_update_elem = fd_array_map_update_elem,  	.map_delete_elem = fd_array_map_delete_elem,  	.map_fd_get_ptr = prog_fd_array_get_ptr,  	.map_fd_put_ptr = prog_fd_array_put_ptr, @@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)  }  late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, +						   struct file *map_file)  { -	bpf_fd_array_map_clear(map); -	fd_array_map_free(map); +	struct bpf_event_entry *ee; + +	ee = kzalloc(sizeof(*ee), GFP_ATOMIC); +	if (ee) { +		ee->event = perf_file->private_data; +		ee->perf_file = perf_file; +		ee->map_file = map_file; +	} + +	return ee;  } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void __bpf_event_entry_free(struct rcu_head *rcu)  { -	struct perf_event *event; -	const struct perf_event_attr *attr; -	struct file *file; +	struct bpf_event_entry *ee; -	file = perf_event_get(fd); -	if (IS_ERR(file)) -		return file; +	ee = container_of(rcu, struct bpf_event_entry, rcu); +	fput(ee->perf_file); +	kfree(ee); +} -	event = file->private_data; +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ +	call_rcu(&ee->rcu, __bpf_event_entry_free); +} -	attr = perf_event_attrs(event); -	if (IS_ERR(attr)) -		goto err; +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, +					 struct file *map_file, int fd) +{ +	const struct perf_event_attr *attr; +	struct bpf_event_entry *ee; +	struct perf_event *event; +	struct file *perf_file; -	if (attr->inherit) -		goto err; +	perf_file = perf_event_get(fd); +	if (IS_ERR(perf_file)) +		return perf_file; -	if (attr->type == PERF_TYPE_RAW) -		return file; +	event = perf_file->private_data; +	ee = ERR_PTR(-EINVAL); -	if (attr->type == PERF_TYPE_HARDWARE) -		return file; +	attr = perf_event_attrs(event); +	if (IS_ERR(attr) || attr->inherit) +		goto err_out; + +	switch (attr->type) { +	case PERF_TYPE_SOFTWARE: +		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) +			goto err_out; +		/* fall-through */ +	case PERF_TYPE_RAW: +	case PERF_TYPE_HARDWARE: +		ee = bpf_event_entry_gen(perf_file, map_file); +		if (ee) +			return ee; +		ee = ERR_PTR(-ENOMEM); +		/* fall-through */ +	default: +		break; +	} -	if (attr->type == PERF_TYPE_SOFTWARE && -	    attr->config == PERF_COUNT_SW_BPF_OUTPUT) -		return file; -err: -	fput(file); -	return ERR_PTR(-EINVAL); +err_out: +	
fput(perf_file); +	return ee;  }  static void perf_event_fd_array_put_ptr(void *ptr)  { -	fput((struct file *)ptr); +	bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, +					struct file *map_file) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); +	struct bpf_event_entry *ee; +	int i; + +	rcu_read_lock(); +	for (i = 0; i < array->map.max_entries; i++) { +		ee = READ_ONCE(array->ptrs[i]); +		if (ee && ee->map_file == map_file) +			fd_array_map_delete_elem(map, &i); +	} +	rcu_read_unlock();  }  static const struct bpf_map_ops perf_event_array_ops = {  	.map_alloc = fd_array_map_alloc, -	.map_free = perf_event_array_map_free, +	.map_free = fd_array_map_free,  	.map_get_next_key = array_map_get_next_key,  	.map_lookup_elem = fd_array_map_lookup_elem, -	.map_update_elem = fd_array_map_update_elem,  	.map_delete_elem = fd_array_map_delete_elem,  	.map_fd_get_ptr = perf_event_fd_array_get_ptr,  	.map_fd_put_ptr = perf_event_fd_array_put_ptr, +	.map_release = perf_event_fd_array_release,  };  static struct bpf_map_type_list perf_event_array_type __read_mostly = { @@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)  	return 0;  }  late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, +				     struct file *map_file /* not used */, +				     int fd) +{ +	return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ +	/* cgroup_put free cgrp after a rcu grace period */ +	cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ +	bpf_fd_array_map_clear(map); +	fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { +	.map_alloc = fd_array_map_alloc, +	.map_free = cgroup_fd_array_free, +	.map_get_next_key = array_map_get_next_key, +	.map_lookup_elem = fd_array_map_lookup_elem, +	.map_delete_elem = fd_array_map_delete_elem, +	.map_fd_get_ptr = cgroup_fd_array_get_ptr, +	.map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { +	.ops = &cgroup_array_ops, +	.type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ +	bpf_register_map_type(&cgroup_array_type); +	return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn:  		if (unlikely(index >= array->map.max_entries))  			goto out; -  		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))  			goto out;  		tail_call_cnt++;  		prog = READ_ONCE(array->ptrs[index]); -		if (unlikely(!prog)) +		if (!prog)  			goto out;  		/* ARG1 at this point is guaranteed to point to CTX from @@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)  	return NULL;  } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, +		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)  { -	return NULL; +	return -ENOTSUPP;  }  /* Always built-in helper functions. 
*/ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ad7a0573f71b..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {  static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)  { -	return raw_smp_processor_id(); +	return smp_processor_id();  }  const struct bpf_func_proto bpf_get_smp_processor_id_proto = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 318858edb1cd..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@   * version 2 as published by the Free Software Foundation.   */ -#include <linux/module.h> +#include <linux/init.h>  #include <linux/magic.h>  #include <linux/major.h>  #include <linux/mount.h> @@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {  	.kill_sb	= kill_litter_super,  }; -MODULE_ALIAS_FS("bpf"); -  static int __init bpf_init(void)  {  	int ret; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 080a2dfb5800..bf4495fcd25d 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)  	if (err)  		goto free_smap; -	err = get_callchain_buffers(); +	err = get_callchain_buffers(sysctl_perf_event_max_stack);  	if (err)  		goto free_smap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 46ecce4b79ed..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)  static int bpf_map_release(struct inode *inode, struct file *filp)  { -	bpf_map_put_with_uref(filp->private_data); +	struct bpf_map *map = filp->private_data; + +	if (map->ops->map_release) +		map->ops->map_release(map, filp); + +	bpf_map_put_with_uref(map);  	return 0;  } @@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)  		err = bpf_percpu_hash_update(map, key, value, attr->flags);  	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		err = bpf_percpu_array_update(map, key, value, attr->flags); +	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || +		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY || +		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { +		rcu_read_lock(); +		err = bpf_fd_array_map_update_elem(map, f.file, key, value, +						   attr->flags); +		rcu_read_unlock();  	} else {  		rcu_read_lock();  		err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)  	free_uid(user);  } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu)  {  	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)  	bpf_prog_free(aux->prog);  } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ -	if (atomic_dec_and_test(&prog->aux->refcnt)) -		call_rcu(&prog->aux->rcu, __prog_put_common); -} -  void bpf_prog_put(struct bpf_prog *prog)  {  	if (atomic_dec_and_test(&prog->aux->refcnt)) -		__prog_put_common(&prog->aux->rcu); +		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);  }  EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)  {  	struct bpf_prog *prog = filp->private_data; -	bpf_prog_put_rcu(prog); +	bpf_prog_put(prog);  	return 0;  } @@ -653,7 +658,7 @@ 
int bpf_prog_new_fd(struct bpf_prog *prog)  				O_RDWR | O_CLOEXEC);  } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f)  {  	if (!f.file)  		return ERR_PTR(-EBADF); @@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)  	return f.file->private_data;  } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)  { -	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { -		atomic_dec(&prog->aux->refcnt); +	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { +		atomic_sub(i, &prog->aux->refcnt);  		return ERR_PTR(-EBUSY);  	}  	return prog;  } +EXPORT_SYMBOL_GPL(bpf_prog_add); -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ +	return bpf_prog_add(prog, 1); +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)  {  	struct fd f = fdget(ufd);  	struct bpf_prog *prog; -	prog = __bpf_prog_get(f); +	prog = ____bpf_prog_get(f);  	if (IS_ERR(prog))  		return prog; +	if (type && prog->type != *type) { +		prog = ERR_PTR(-EINVAL); +		goto out; +	}  	prog = bpf_prog_inc(prog); +out:  	fdput(f); -  	return prog;  } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ +	return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ +	return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type);  /* last field in 'union bpf_attr' used by this command */  #define	BPF_PROG_LOAD_LAST_FIELD kern_version diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 668e07903c8f..f72f23b8fdab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -126,31 +126,6 @@   * are set to NOT_INIT to indicate that they are no longer readable.   
*/ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { -	NOT_INIT = 0,		 /* nothing was written into register */ -	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */ -	PTR_TO_CTX,		 /* reg points to bpf_context */ -	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */ -	PTR_TO_MAP_VALUE,	 /* reg points to map element value */ -	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ -	FRAME_PTR,		 /* reg == frame_pointer */ -	PTR_TO_STACK,		 /* reg == frame_pointer + imm */ -	CONST_IMM,		 /* constant integer value */ - -	/* PTR_TO_PACKET represents: -	 * skb->data -	 * skb->data + imm -	 * skb->data + (u16) var -	 * skb->data + (u16) var + imm -	 * if (range > 0) then [ptr, ptr + range - off) is safe to access -	 * if (id > 0) means that some 'var' was added -	 * if (off > 0) menas that 'imm' was added -	 */ -	PTR_TO_PACKET, -	PTR_TO_PACKET_END,	 /* skb->data + headlen */ -}; -  struct reg_state {  	enum bpf_reg_type type;  	union { @@ -678,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,  #define MAX_PACKET_OFF 0xffff +static bool may_write_pkt_data(enum bpf_prog_type type) +{ +	switch (type) { +	case BPF_PROG_TYPE_XDP: +		return true; +	default: +		return false; +	} +} +  static int check_packet_access(struct verifier_env *env, u32 regno, int off,  			       int size)  { @@ -695,10 +680,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,  /* check access to 'struct bpf_context' fields */  static int check_ctx_access(struct verifier_env *env, int off, int size, -			    enum bpf_access_type t) +			    enum bpf_access_type t, enum bpf_reg_type *reg_type)  {  	if (env->prog->aux->ops->is_valid_access && -	    env->prog->aux->ops->is_valid_access(off, size, t)) { +	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {  		/* remember the offset of last byte accessed in ctx */  		if (env->prog->aux->max_ctx_offset < off + size)  			env->prog->aux->max_ctx_offset = off + size; @@ -738,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,  	switch (env->prog->type) {  	case BPF_PROG_TYPE_SCHED_CLS:  	case BPF_PROG_TYPE_SCHED_ACT: +	case BPF_PROG_TYPE_XDP:  		break;  	default:  		verbose("verifier is misconfigured\n"); @@ -798,21 +784,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,  			mark_reg_unknown_value(state->regs, value_regno);  	} else if (reg->type == PTR_TO_CTX) { +		enum bpf_reg_type reg_type = UNKNOWN_VALUE; +  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) {  			verbose("R%d leaks addr into ctx\n", value_regno);  			return -EACCES;  		} -		err = check_ctx_access(env, off, size, t); +		err = check_ctx_access(env, off, size, t, ®_type);  		if (!err && t == BPF_READ && value_regno >= 0) {  			mark_reg_unknown_value(state->regs, value_regno); -			if (off == offsetof(struct __sk_buff, data) && -			    env->allow_ptr_leaks) +			if (env->allow_ptr_leaks)  				/* note that reg.[id|off|range] == 0 */ -				state->regs[value_regno].type = PTR_TO_PACKET; -			else if (off == offsetof(struct __sk_buff, data_end) && -				 env->allow_ptr_leaks) -				state->regs[value_regno].type = PTR_TO_PACKET_END; +				state->regs[value_regno].type = reg_type;  		}  	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -832,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,  			err = check_stack_read(state, off, size, value_regno);  		}  	} 
else if (state->regs[regno].type == PTR_TO_PACKET) { -		if (t == BPF_WRITE) { +		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {  			verbose("cannot write into packet\n");  			return -EACCES;  		} +		if (t == BPF_WRITE && value_regno >= 0 && +		    is_pointer_value(env, value_regno)) { +			verbose("R%d leaks addr into packet\n", value_regno); +			return -EACCES; +		}  		err = check_packet_access(env, regno, off, size);  		if (!err && t == BPF_READ && value_regno >= 0)  			mark_reg_unknown_value(state->regs, value_regno); @@ -1062,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  		if (func_id != BPF_FUNC_get_stackid)  			goto error;  		break; +	case BPF_MAP_TYPE_CGROUP_ARRAY: +		if (func_id != BPF_FUNC_skb_in_cgroup) +			goto error; +		break;  	default:  		break;  	} @@ -1081,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)  			goto error;  		break; +	case BPF_FUNC_skb_in_cgroup: +		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) +			goto error; +		break;  	default:  		break;  	} diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)  	return has_ns_capability_noaudit(t, &init_user_ns, cap);  } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ +	int capable; + +	if (unlikely(!cap_valid(cap))) { +		pr_crit("capable() called with invalid cap=%u\n", cap); +		BUG(); +	} + +	capable = audit ? security_capable(current_cred(), ns, cap) : +			  security_capable_noaudit(current_cred(), ns, cap); +	if (capable == 0) { +		current->flags |= PF_SUPERPRIV; +		return true; +	} +	return false; +} +  /**   * ns_capable - Determine if the current task has a superior capability in effect   * @ns:  The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)   */  bool ns_capable(struct user_namespace *ns, int cap)  { -	if (unlikely(!cap_valid(cap))) { -		pr_crit("capable() called with invalid cap=%u\n", cap); -		BUG(); -	} - -	if (security_capable(current_cred(), ns, cap) == 0) { -		current->flags |= PF_SUPERPRIV; -		return true; -	} -	return false; +	return ns_capable_common(ns, cap, true);  }  EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns:  The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
+ */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ +	return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit);  /**   * capable - Determine if the current task has a superior capability in effect diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..d1c51b7f5221 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -61,7 +61,7 @@  #include <linux/cpuset.h>  #include <linux/proc_ns.h>  #include <linux/nsproxy.h> -#include <linux/proc_ns.h> +#include <linux/file.h>  #include <net/sock.h>  /* @@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)  static void put_css_set(struct css_set *cset)  { +	unsigned long flags; +  	/*  	 * Ensure that the refcount doesn't hit zero while any readers  	 * can see it. Similar to atomic_dec_and_lock(), but for an @@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)  	if (atomic_add_unless(&cset->refcount, -1, 1))  		return; -	spin_lock_bh(&css_set_lock); +	spin_lock_irqsave(&css_set_lock, flags);  	put_css_set_locked(cset); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irqrestore(&css_set_lock, flags);  }  /* @@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	/* First see if we already have a cgroup group that matches  	 * the desired set */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	cset = find_existing_css_set(old_cset, cgrp, template);  	if (cset)  		get_css_set(cset); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (cset)  		return cset; @@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	 * find_existing_css_set() */  	memcpy(cset->subsys, template, sizeof(cset->subsys)); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	/* Add reference counts and links from the new css_set. 
*/  	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {  		struct cgroup *c = link->cgrp; @@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,  		css_get(css);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return cset;  } @@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)  {  	lockdep_assert_held(&cgroup_mutex); -	if (root->hierarchy_id) { -		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); -		root->hierarchy_id = 0; -	} +	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);  }  static void cgroup_free_root(struct cgroup_root *root)  {  	if (root) { -		/* hierarchy ID should already have been released */ -		WARN_ON_ONCE(root->hierarchy_id); -  		idr_destroy(&root->cgroup_idr);  		kfree(root);  	} @@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)  	 * Release all the links from cset_links to this hierarchy's  	 * root cgroup  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {  		list_del(&link->cset_link); @@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)  		kfree(link);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (!list_empty(&root->root_list)) {  		list_del(&root->root_list); @@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)  		ss->root = dst_root;  		css->cgroup = dcgrp; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		hash_for_each(css_set_table, i, cset, hlist)  			list_move_tail(&cset->e_cset_node[ss->id],  				       &dcgrp->e_csets[ss->id]); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		/* default hierarchy doesn't enable controllers by default */  		dst_root->subsys_mask |= 1 << ssid; @@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,  	if (!buf)  		return -ENOMEM; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);  	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (len >= PATH_MAX)  		len = -ERANGE; @@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void)  {  	struct task_struct *p, *g; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	if (use_task_css_set_links)  		goto out_unlock; @@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void)  		 * entry won't be deleted though the process has exited.  		 * Do it while holding siglock so that we don't end up  		 * racing against cgroup_exit(). +		 * +		 * Interrupts were already disabled while acquiring +		 * the css_set_lock, so we do not need to disable it +		 * again when acquiring the sighand->siglock here.  		 
*/ -		spin_lock_irq(&p->sighand->siglock); +		spin_lock(&p->sighand->siglock);  		if (!(p->flags & PF_EXITING)) {  			struct css_set *cset = task_css_set(p); @@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void)  			list_add_tail(&p->cg_list, &cset->tasks);  			get_css_set(cset);  		} -		spin_unlock_irq(&p->sighand->siglock); +		spin_unlock(&p->sighand->siglock);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock);  out_unlock: -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)  	 * Link the root cgroup in this hierarchy into all the css_set  	 * objects.  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	hash_for_each(css_set_table, i, cset, hlist) {  		link_css_set(&tmp_links, cset, root_cgrp);  		if (css_set_populated(cset))  			cgroup_update_populated(root_cgrp, true);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	BUG_ON(!list_empty(&root_cgrp->self.children));  	BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		goto out_unlock;  	} -	/* -	 * We know this subsystem has not yet been bound.  Users in a non-init -	 * user namespace may only mount hierarchies with no bound subsystems, -	 * i.e. 'none,name=user1' -	 */ -	if (!opts.none && !capable(CAP_SYS_ADMIN)) { +	/* Hierarchies may only be created in the initial cgroup namespace. */ +	if (ns != &init_cgroup_ns) {  		ret = -EPERM;  		goto out_unlock;  	} @@ -2256,11 +2252,11 @@ out_mount:  		struct cgroup *cgrp;  		mutex_lock(&cgroup_mutex); -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cgrp = cset_cgroup_from_root(ns->root_cset, root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		mutex_unlock(&cgroup_mutex);  		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); @@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,  	char *ret;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	return ret; @@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)  	char *path = NULL;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)  			path = buf;  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	return path;  } @@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  	 * the new cgroup.  There are no failure cases after here, so this  	 * is the commit point.  	 
*/ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(cset, &tset->src_csets, mg_node) {  		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {  			struct css_set *from_cset = task_css_set(task); @@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  			put_css_set_locked(from_cset);  		}  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/*  	 * Migration is committed, all target tasks are now on dst_csets. @@ -2597,13 +2593,13 @@ out_cancel_attach:  		}  	} while_each_subsys_mask();  out_release_tset: -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_splice_init(&tset->dst_csets, &tset->src_csets);  	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {  		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);  		list_del_init(&cset->mg_node);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return ret;  } @@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)  	lockdep_assert_held(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {  		cset->mg_src_cgrp = NULL;  		cset->mg_dst_cgrp = NULL; @@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)  		list_del_init(&cset->mg_preload_node);  		put_css_set_locked(cset);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  /** @@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,  	 * already PF_EXITING could be freed from underneath us unless we  	 * take an rcu_read_lock.  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	task = leader;  	do { @@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,  			break;  	} while_each_thread(leader, task);  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return cgroup_taskset_migrate(&tset, root);  } @@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,  		return -EBUSY;  	/* look up all src csets */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	task = leader;  	do { @@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,  			break;  	} while_each_thread(leader, task);  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* prepare dst csets and commit */  	ret = cgroup_migrate_prepare_dst(&preloaded_csets); @@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,  		struct cgroup *cgrp;  		struct inode *inode; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		while (!cgroup_is_descendant(dst_cgrp, cgrp))  			cgrp = cgroup_parent(cgrp); @@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)  	int retval = 0;  	mutex_lock(&cgroup_mutex); +	percpu_down_write(&cgroup_threadgroup_rwsem);  	for_each_root(root) {  		struct cgroup *from_cgrp;  		if (root == &cgrp_dfl_root)  			continue; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		from_cgrp = task_cgroup_from_root(from, root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock); 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);  		if (retval)  			break;  	} +	percpu_up_write(&cgroup_threadgroup_rwsem);  	mutex_unlock(&cgroup_mutex);  	return retval; @@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  	percpu_down_write(&cgroup_threadgroup_rwsem);  	/* look up all csses currently attached to @cgrp's subtree */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {  		struct cgrp_cset_link *link; @@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  			cgroup_migrate_add_src(link->cset, dsct,  					       &preloaded_csets);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* NULL dst indicates self on default hierarchy */  	ret = cgroup_migrate_prepare_dst(&preloaded_csets);  	if (ret)  		goto out_finish; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {  		struct task_struct *task, *ntask; @@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)  			cgroup_taskset_add(task, &tset);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	ret = cgroup_taskset_migrate(&tset, cgrp->root);  out_finish: @@ -3908,10 +3906,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)  	int count = 0;  	struct cgrp_cset_link *link; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &cgrp->cset_links, cset_link)  		count += atomic_read(&link->cset->refcount); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return count;  } @@ -4249,7 +4247,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  	memset(it, 0, sizeof(*it)); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	it->ss = css->ss; @@ -4262,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  	css_task_iter_advance_css_set(it); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  /** @@ -4280,7 +4278,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  		it->cur_task = NULL;  	} -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	if (it->task_pos) {  		it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -4289,7 +4287,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  		css_task_iter_advance(it);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return it->cur_task;  } @@ -4303,10 +4301,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  void css_task_iter_end(struct css_task_iter *it)  {  	if (it->cur_cset) { -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		list_del(&it->iters_node);  		put_css_set_locked(it->cur_cset); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	}  	if (it->cur_task) @@ -4337,11 +4335,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  	mutex_lock(&cgroup_mutex); +	percpu_down_write(&cgroup_threadgroup_rwsem); +  	/* all tasks in @from are being moved, all csets are source */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &from->cset_links, cset_link)  		cgroup_migrate_add_src(link->cset, to, &preloaded_csets); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	ret = cgroup_migrate_prepare_dst(&preloaded_csets);  	if 
(ret) @@ -4365,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  	} while (task && !ret);  out_err:  	cgroup_migrate_finish(&preloaded_csets); +	percpu_up_write(&cgroup_threadgroup_rwsem);  	mutex_unlock(&cgroup_mutex);  	return ret;  } @@ -5063,6 +5064,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,  	memset(css, 0, sizeof(*css));  	css->cgroup = cgrp;  	css->ss = ss; +	css->id = -1;  	INIT_LIST_HEAD(&css->sibling);  	INIT_LIST_HEAD(&css->children);  	css->serial_nr = css_serial_nr_next++; @@ -5139,6 +5141,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,  	lockdep_assert_held(&cgroup_mutex);  	css = ss->css_alloc(parent_css); +	if (!css) +		css = ERR_PTR(-ENOMEM);  	if (IS_ERR(css))  		return css; @@ -5150,7 +5154,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,  	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);  	if (err < 0) -		goto err_free_percpu_ref; +		goto err_free_css;  	css->id = err;  	/* @css is ready to be brought online now, make it visible */ @@ -5174,9 +5178,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,  err_list_del:  	list_del_rcu(&css->sibling); -	cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: -	percpu_ref_exit(&css->refcnt);  err_free_css:  	call_rcu(&css->rcu_head, css_free_rcu_fn);  	return ERR_PTR(err); @@ -5451,10 +5452,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  	 */  	cgrp->self.flags &= ~CSS_ONLINE; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &cgrp->cset_links, cset_link)  		link->cset->dead = true; -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* initiate massacre of all css's */  	for_each_css(css, ssid, cgrp) @@ -5725,7 +5726,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,  		goto out;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	for_each_root(root) {  		struct cgroup_subsys *ss; @@ -5778,7 +5779,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,  	retval = 0;  out_unlock: -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	kfree(buf);  out: @@ -5923,13 +5924,13 @@ void cgroup_post_fork(struct task_struct *child)  	if (use_task_css_set_links) {  		struct css_set *cset; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cset = task_css_set(current);  		if (list_empty(&child->cg_list)) {  			get_css_set(cset);  			css_set_move_task(child, NULL, cset, false);  		} -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	}  	/* @@ -5974,9 +5975,9 @@ void cgroup_exit(struct task_struct *tsk)  	cset = task_css_set(tsk);  	if (!list_empty(&tsk->cg_list)) { -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		css_set_move_task(tsk, cset, NULL, false); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	} else {  		get_css_set(cset);  	} @@ -6044,9 +6045,9 @@ static void cgroup_release_agent(struct work_struct *work)  	if (!pathbuf || !agentbuf)  		goto out; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (!path)  		goto out; @@ -6168,7 +6169,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,  struct cgroup_subsys_state *css_from_id(int id, 
struct cgroup_subsys *ss)  {  	WARN_ON_ONCE(!rcu_read_lock_held()); -	return id > 0 ? idr_find(&ss->css_idr, id) : NULL; +	return idr_find(&ss->css_idr, id);  }  /** @@ -6205,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path)  }  EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory.  Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ +	struct cgroup_subsys_state *css; +	struct cgroup *cgrp; +	struct file *f; + +	f = fget_raw(fd); +	if (!f) +		return ERR_PTR(-EBADF); + +	css = css_tryget_online_from_dir(f->f_path.dentry, NULL); +	fput(f); +	if (IS_ERR(css)) +		return ERR_CAST(css); + +	cgrp = css->cgroup; +	if (!cgroup_on_dfl(cgrp)) { +		cgroup_put(cgrp); +		return ERR_PTR(-EBADF); +	} + +	return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); +  /*   * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data   * definition in cgroup-defs.h. @@ -6305,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))  		return ERR_PTR(-EPERM); -	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); - +	/* It is not safe to take cgroup_mutex here */ +	spin_lock_irq(&css_set_lock);  	cset = task_css_set(current);  	get_css_set(cset); - -	spin_unlock_bh(&css_set_lock); -	mutex_unlock(&cgroup_mutex); +	spin_unlock_irq(&css_set_lock);  	new_ns = alloc_cgroup_ns();  	if (IS_ERR(new_ns)) { @@ -6435,7 +6467,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)  	if (!name_buf)  		return -ENOMEM; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	cset = rcu_dereference(current->cgroups);  	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -6446,7 +6478,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)  			   c->root->hierarchy_id, name_buf);  	}  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	kfree(name_buf);  	return 0;  } @@ -6457,7 +6489,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)  	struct cgroup_subsys_state *css = seq_css(seq);  	struct cgrp_cset_link *link; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {  		struct css_set *cset = link->cset;  		struct task_struct *task; @@ -6480,7 +6512,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)  	overflow:  		seq_puts(seq, "  ...\n");  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return 0;  } diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -49,6 +49,12 @@ struct pids_cgroup {  	 */  	atomic64_t			counter;  	int64_t				limit; + +	/* Handle for "pids.events" */ +	struct cgroup_file		events_file; + +	/* Number of times fork failed because limit was hit. 
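
cgroup_get_from_fd(), added above, resolves a file descriptor to a cgroup and rejects anything that is not on the default (v2) hierarchy. A minimal userspace sketch of producing such a descriptor follows; the mount point and group name are assumptions, and the interface the fd is eventually handed to is left open:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed cgroup2 mount point and group; adjust for the local setup. */
	int fd = open("/sys/fs/cgroup/mygroup", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open cgroup directory");
		return 1;
	}
	/* A kernel interface built on cgroup_get_from_fd() would take this
	 * fd; a descriptor for a v1 hierarchy is rejected with -EBADF,
	 * matching the cgroup_on_dfl() check above. */
	close(fd);
	return 0;
}
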
*/ +	atomic64_t			events_limit;  };  static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)  	pids->limit = PIDS_MAX;  	atomic64_set(&pids->counter, 0); +	atomic64_set(&pids->events_limit, 0);  	return &pids->css;  } @@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)  {  	struct cgroup_subsys_state *css;  	struct pids_cgroup *pids; +	int err;  	css = task_css_check(current, pids_cgrp_id, true);  	pids = css_pids(css); -	return pids_try_charge(pids, 1); +	err = pids_try_charge(pids, 1); +	if (err) { +		/* Only log the first time events_limit is incremented. */ +		if (atomic64_inc_return(&pids->events_limit) == 1) { +			pr_info("cgroup: fork rejected by pids controller in "); +			pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); +			pr_cont("\n"); +		} +		cgroup_file_notify(&pids->events_file); +	} +	return err;  }  static void pids_cancel_fork(struct task_struct *task) @@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,  	return atomic64_read(&pids->counter);  } +static int pids_events_show(struct seq_file *sf, void *v) +{ +	struct pids_cgroup *pids = css_pids(seq_css(sf)); + +	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); +	return 0; +} +  static struct cftype pids_files[] = {  	{  		.name = "max", @@ -300,6 +326,12 @@ static struct cftype pids_files[] = {  		.read_s64 = pids_current_read,  		.flags = CFTYPE_NOT_ON_ROOT,  	}, +	{ +		.name = "events", +		.seq_show = pids_events_show, +		.file_offset = offsetof(struct pids_cgroup, events_file), +		.flags = CFTYPE_NOT_ON_ROOT, +	},  	{ }	/* terminate */  }; diff --git a/kernel/cpu.c b/kernel/cpu.c index d948e44c471e..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,  	if (!cpu_online(cpu))  		return 0; +	/* +	 * If we are up and running, use the hotplug thread. For early calls +	 * we invoke the thread function directly. +	 */ +	if (!st->thread) +		return cpuhp_invoke_callback(cpu, state, cb); +  	st->cb_state = state;  	st->cb = cb;  	/* @@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = {  		.teardown		= NULL,  		.cant_stop		= true,  	}, +	[CPUHP_PERF_PREPARE] = { +		.name = "perf prepare", +		.startup = perf_event_init_cpu, +		.teardown = perf_event_exit_cpu, +	}, +	[CPUHP_WORKQUEUE_PREP] = { +		.name = "workqueue prepare", +		.startup = workqueue_prepare_cpu, +		.teardown = NULL, +	}, +	[CPUHP_HRTIMERS_PREPARE] = { +		.name = "hrtimers prepare", +		.startup = hrtimers_prepare_cpu, +		.teardown = hrtimers_dead_cpu, +	}, +	[CPUHP_SMPCFD_PREPARE] = { +		.name = "SMPCFD prepare", +		.startup = smpcfd_prepare_cpu, +		.teardown = smpcfd_dead_cpu, +	}, +	[CPUHP_RCUTREE_PREP] = { +		.name = "RCU-tree prepare", +		.startup = rcutree_prepare_cpu, +		.teardown = rcutree_dead_cpu, +	},  	/*  	 * Preparatory and dead notifiers. Will be replaced once the notifiers  	 * are converted to states. @@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {  		.skip_onerr		= true,  		.cant_stop		= true,  	}, +	/* +	 * On the tear-down path, timers_dead_cpu() must be invoked +	 * before blk_mq_queue_reinit_notify() from notify_dead(), +	 * otherwise a RCU stall occurs. 
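
The pids controller changes above add a "pids.events" file whose single "max <count>" line reports how many forks were rejected because pids.max was reached, with cgroup_file_notify() waking pollers when the count changes. A small hedged reader, with the cgroup path as an assumption:

#include <stdio.h>

int main(void)
{
	/* Assumed path of a pids-enabled cgroup on the unified hierarchy. */
	FILE *f = fopen("/sys/fs/cgroup/mygroup/pids.events", "r");
	char line[64];

	if (!f) {
		perror("pids.events");
		return 1;
	}
	/* Expected format from pids_events_show(): "max <count>\n" */
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
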
+	 */ +	[CPUHP_TIMERS_DEAD] = { +		.name = "timers dead", +		.startup = NULL, +		.teardown = timers_dead_cpu, +	},  	/* Kicks the plugged cpu into life */  	[CPUHP_BRINGUP_CPU] = {  		.name			= "cpu:bringup", @@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {  		.teardown		= NULL,  		.cant_stop		= true,  	}, +	[CPUHP_AP_SMPCFD_DYING] = { +		.startup = NULL, +		.teardown = smpcfd_dying_cpu, +	},  	/*  	 * Handled on controll processor until the plugged processor manages  	 * this itself. @@ -1201,6 +1247,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {  		.teardown		= takedown_cpu,  		.cant_stop		= true,  	}, +#else +	[CPUHP_BRINGUP_CPU] = { },  #endif  }; @@ -1225,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {  		.startup		= sched_cpu_starting,  		.teardown		= sched_cpu_dying,  	}, +	[CPUHP_AP_RCUTREE_DYING] = { +		.startup = NULL, +		.teardown = rcutree_dying_cpu, +	},  	/*  	 * Low level startup/teardown notifiers. Run with interrupts  	 * disabled. Will be removed once the notifiers are converted to @@ -1248,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {  		.startup		= smpboot_unpark_threads,  		.teardown		= NULL,  	}, +	[CPUHP_AP_PERF_ONLINE] = { +		.name = "perf online", +		.startup = perf_event_init_cpu, +		.teardown = perf_event_exit_cpu, +	}, +	[CPUHP_AP_WORKQUEUE_ONLINE] = { +		.name = "workqueue online", +		.startup = workqueue_online_cpu, +		.teardown = workqueue_offline_cpu, +	}, +	[CPUHP_AP_RCUTREE_ONLINE] = { +		.name = "RCU-tree online", +		.startup = rcutree_online_cpu, +		.teardown = rcutree_offline_cpu, +	}, +  	/*  	 * Online/down_prepare notifiers. Will be removed once the notifiers  	 * are converted to states. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  {  	bool need_loop; -	/* -	 * Allow tasks that have access to memory reserves because they have -	 * been OOM killed to get memory anywhere. -	 */ -	if (unlikely(test_thread_flag(TIF_MEMDIE))) -		return; -	if (current->flags & PF_EXITING) /* Let dying task have memory */ -		return; -  	task_lock(tsk);  	/*  	 * Determine if a loop is necessary if another thread is doing diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);   */  int set_create_files_as(struct cred *new, struct inode *inode)  { +	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) +		return -EINVAL;  	new->fsuid = inode->i_uid;  	new->fsgid = inode->i_gid;  	return security_kernel_create_files_as(new, inode); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -104,7 +104,7 @@ fail:  	return -ENOMEM;  } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack)  {  	int err = 0;  	int count; @@ -121,6 +121,15 @@ int get_callchain_buffers(void)  		/* If the allocation failed, give up */  		if (!callchain_cpus_entries)  			err = -ENOMEM; +		/* +		 * If requesting per event more than the global cap, +		 * return a different error to help userspace figure +		 * this out. +		 * +		 * And also do it here so that we have &callchain_mutex held. 
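
The new CPUHP_* entries above replace CPU notifier callbacks with hotplug-state callbacks that the core invokes in a fixed order. A subsystem outside these static tables would typically use the dynamic registration API instead; the sketch below is illustrative only, with invented callback names, and assumes the cpuhp_setup_state() interface of this kernel series:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int example_online_cpu(unsigned int cpu)
{
	/* set up per-cpu state once the CPU is up */
	return 0;
}

static int example_offline_cpu(unsigned int cpu)
{
	/* tear it down before the CPU goes away */
	return 0;
}

static int __init example_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_online_cpu, example_offline_cpu);
	return ret < 0 ? ret : 0;	/* dynamic states return the state number */
}
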
+		 */ +		if (event_max_stack > sysctl_perf_event_max_stack) +			err = -EOVERFLOW;  		goto exit;  	} @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)  	bool user   = !event->attr.exclude_callchain_user;  	/* Disallow cross-task user callchains. */  	bool crosstask = event->ctx->task && event->ctx->task != current; +	const u32 max_stack = event->attr.sample_max_stack;  	if (!kernel && !user)  		return NULL; -	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); +	return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);  }  struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..356a6c7cb52a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -335,6 +335,7 @@ static atomic_t perf_sched_count;  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);  static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);  static atomic_t nr_mmap_events __read_mostly;  static atomic_t nr_comm_events __read_mostly; @@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,  	if (ret || !write)  		return ret; +	/* +	 * If throttling is disabled don't allow the write: +	 */ +	if (sysctl_perf_cpu_time_max_percent == 100 || +	    sysctl_perf_cpu_time_max_percent == 0) +		return -EINVAL; +  	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);  	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;  	update_perf_cpu_limits(); @@ -1678,12 +1686,33 @@ static bool is_orphaned_event(struct perf_event *event)  	return event->state == PERF_EVENT_STATE_DEAD;  } -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event)  {  	struct pmu *pmu = event->pmu;  	return pmu->filter_match ? pmu->filter_match(event) : 1;  } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. 
An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ +	struct perf_event *child; + +	if (!__pmu_filter_match(event)) +		return 0; + +	list_for_each_entry(child, &event->sibling_list, group_entry) { +		if (!__pmu_filter_match(child)) +			return 0; +	} + +	return 1; +} +  static inline int  event_filter_match(struct perf_event *event)  { @@ -3665,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head)  static void ring_buffer_attach(struct perf_event *event,  			       struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ +	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + +	raw_spin_lock(&pel->lock); +	list_del_rcu(&event->sb_list); +	raw_spin_unlock(&pel->lock); +} + +static bool is_sb_event(struct perf_event *event) +{ +	struct perf_event_attr *attr = &event->attr; + +	if (event->parent) +		return false; + +	if (event->attach_state & PERF_ATTACH_TASK) +		return false; + +	if (attr->mmap || attr->mmap_data || attr->mmap2 || +	    attr->comm || attr->comm_exec || +	    attr->task || +	    attr->context_switch) +		return true; +	return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ +	if (is_sb_event(event)) +		detach_sb_event(event); +} +  static void unaccount_event_cpu(struct perf_event *event, int cpu)  {  	if (event->parent) @@ -3728,6 +3790,8 @@ static void unaccount_event(struct perf_event *event)  	}  	unaccount_event_cpu(event, event->cpu); + +	unaccount_pmu_sb_event(event);  }  static void perf_sched_delayed(struct work_struct *work) @@ -5553,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,  	}  	if (sample_type & PERF_SAMPLE_RAW) { -		if (data->raw) { -			u32 raw_size = data->raw->size; -			u32 real_size = round_up(raw_size + sizeof(u32), -						 sizeof(u64)) - sizeof(u32); -			u64 zero = 0; - -			perf_output_put(handle, real_size); -			__output_copy(handle, data->raw->data, raw_size); -			if (real_size - raw_size) -				__output_copy(handle, &zero, real_size - raw_size); +		struct perf_raw_record *raw = data->raw; + +		if (raw) { +			struct perf_raw_frag *frag = &raw->frag; + +			perf_output_put(handle, raw->size); +			do { +				if (frag->copy) { +					__output_custom(handle, frag->copy, +							frag->data, frag->size); +				} else { +					__output_copy(handle, frag->data, +						      frag->size); +				} +				if (perf_raw_frag_last(frag)) +					break; +				frag = frag->next; +			} while (1); +			if (frag->pad) +				__output_skip(handle, NULL, frag->pad);  		} else {  			struct {  				u32	size; @@ -5687,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,  	}  	if (sample_type & PERF_SAMPLE_RAW) { -		int size = sizeof(u32); - -		if (data->raw) -			size += data->raw->size; -		else -			size += sizeof(u32); +		struct perf_raw_record *raw = data->raw; +		int size; + +		if (raw) { +			struct perf_raw_frag *frag = &raw->frag; +			u32 sum = 0; + +			do { +				sum += frag->size; +				if (perf_raw_frag_last(frag)) +					break; +				frag = frag->next; +			} while (1); + +			size = round_up(sum + sizeof(u32), sizeof(u64)); +			raw->size = size - sizeof(u32); +			frag->pad = raw->size - sum; +		} else { +			size = sizeof(u64); +		} -		header->size += round_up(size, sizeof(u64)); +		header->size += size;  	}  	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5854,11 +5942,11 @@ 
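
With the change above a raw sample is described by a chain of fragments; perf_prepare_sample() sums the fragment sizes, rounds the u32 size header plus payload up to a u64 boundary, and records the remainder as trailing pad that perf_output_sample() emits as zero bytes. A userspace model of that arithmetic, using local stand-in types rather than the kernel structures:

#include <stdio.h>

struct frag {
	unsigned int size;
	struct frag *next;
};

#define ROUND_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	struct frag f2 = { .size = 5,  .next = NULL };
	struct frag f1 = { .size = 13, .next = &f2 };
	unsigned int sum = 0;

	for (struct frag *f = &f1; f; f = f->next)
		sum += f->size;				/* 18 payload bytes */

	/* u32 size header plus payload, padded to a u64 boundary */
	unsigned int size = ROUND_UP(sum + 4, 8);	/* 24 */
	unsigned int raw_size = size - 4;		/* 20, written as the header */
	unsigned int pad = raw_size - sum;		/* 2 zero bytes appended */

	printf("sum=%u size=%u raw_size=%u pad=%u\n", sum, size, raw_size, pad);
	return 0;
}
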
perf_event_read_event(struct perf_event *event,  	perf_output_end(&handle);  } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data);  static void -perf_event_aux_ctx(struct perf_event_context *ctx, -		   perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, +		   perf_iterate_f output,  		   void *data, bool all)  {  	struct perf_event *event; @@ -5875,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx,  	}  } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, -			struct perf_event_context *task_ctx) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)  { -	rcu_read_lock(); -	preempt_disable(); -	perf_event_aux_ctx(task_ctx, output, data, false); -	preempt_enable(); -	rcu_read_unlock(); +	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); +	struct perf_event *event; + +	list_for_each_entry_rcu(event, &pel->list, sb_list) { +		if (event->state < PERF_EVENT_STATE_INACTIVE) +			continue; +		if (!event_filter_match(event)) +			continue; +		output(event, data); +	}  } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */  static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data,  	       struct perf_event_context *task_ctx)  { -	struct perf_cpu_context *cpuctx;  	struct perf_event_context *ctx; -	struct pmu *pmu;  	int ctxn; +	rcu_read_lock(); +	preempt_disable(); +  	/* -	 * If we have task_ctx != NULL we only notify -	 * the task context itself. The task_ctx is set -	 * only for EXIT events before releasing task +	 * If we have task_ctx != NULL we only notify the task context itself. +	 * The task_ctx is set only for EXIT events before releasing task  	 * context.  	 
*/  	if (task_ctx) { -		perf_event_aux_task_ctx(output, data, task_ctx); -		return; +		perf_iterate_ctx(task_ctx, output, data, false); +		goto done;  	} -	rcu_read_lock(); -	list_for_each_entry_rcu(pmu, &pmus, entry) { -		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->unique_pmu != pmu) -			goto next; -		perf_event_aux_ctx(&cpuctx->ctx, output, data, false); -		ctxn = pmu->task_ctx_nr; -		if (ctxn < 0) -			goto next; +	perf_iterate_sb_cpu(output, data); + +	for_each_task_context_nr(ctxn) {  		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);  		if (ctx) -			perf_event_aux_ctx(ctx, output, data, false); -next: -		put_cpu_ptr(pmu->pmu_cpu_context); +			perf_iterate_ctx(ctx, output, data, false);  	} +done: +	preempt_enable();  	rcu_read_unlock();  } @@ -5969,7 +6060,7 @@ void perf_event_exec(void)  		perf_event_enable_on_exec(ctxn); -		perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, +		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,  				   true);  	}  	rcu_read_unlock(); @@ -6013,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info)  	};  	rcu_read_lock(); -	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); +	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);  	if (cpuctx->task_ctx) -		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, +		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,  				   &ro, false);  	rcu_read_unlock(); @@ -6144,7 +6235,7 @@ static void perf_event_task(struct task_struct *task,  		},  	}; -	perf_event_aux(perf_event_task_output, +	perf_iterate_sb(perf_event_task_output,  		       &task_event,  		       task_ctx);  } @@ -6223,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)  	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; -	perf_event_aux(perf_event_comm_output, +	perf_iterate_sb(perf_event_comm_output,  		       comm_event,  		       NULL);  } @@ -6454,7 +6545,7 @@ got_name:  	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; -	perf_event_aux(perf_event_mmap_output, +	perf_iterate_sb(perf_event_mmap_output,  		       mmap_event,  		       NULL); @@ -6537,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)  		if (!ctx)  			continue; -		perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); +		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);  	}  	rcu_read_unlock();  } @@ -6724,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task,  		},  	}; -	perf_event_aux(perf_event_switch_output, +	perf_iterate_sb(perf_event_switch_output,  		       &switch_event,  		       NULL);  } @@ -7331,7 +7422,7 @@ static struct pmu perf_swevent = {  static int perf_tp_filter_match(struct perf_event *event,  				struct perf_sample_data *data)  { -	void *record = data->raw->data; +	void *record = data->raw->frag.data;  	/* only top level events have filters set */  	if (event->parent) @@ -7387,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,  	struct perf_event *event;  	struct perf_raw_record raw = { -		.size = entry_size, -		.data = record, +		.frag = { +			.size = entry_size, +			.data = record, +		},  	};  	perf_sample_data_init(&data, 0, 0); @@ -8646,6 +8739,28 @@ unlock:  	return pmu;  } +static void attach_sb_event(struct perf_event *event) +{ +	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + +	raw_spin_lock(&pel->lock); +	list_add_rcu(&event->sb_list, &pel->list); +	
raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ +	if (is_sb_event(event)) +		attach_sb_event(event); +} +  static void account_event_cpu(struct perf_event *event, int cpu)  {  	if (event->parent) @@ -8726,6 +8841,8 @@ static void account_event(struct perf_event *event)  enabled:  	account_event_cpu(event, event->cpu); + +	account_pmu_sb_event(event);  }  /* @@ -8874,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,  	if (!event->parent) {  		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { -			err = get_callchain_buffers(); +			err = get_callchain_buffers(attr->sample_max_stack);  			if (err)  				goto err_addr_filters;  		} @@ -9196,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open,  			return -EINVAL;  	} +	if (!attr.sample_max_stack) +		attr.sample_max_stack = sysctl_perf_event_max_stack; +  	/*  	 * In cgroup mode, the pid argument is used to pass the fd  	 * opened to the cgroup directory in cgroupfs. The cpu argument @@ -9269,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open,  	if (is_sampling_event(event)) {  		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { -			err = -ENOTSUPP; +			err = -EOPNOTSUPP;  			goto err_alloc;  		}  	} @@ -10231,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void)  		swhash = &per_cpu(swevent_htable, cpu);  		mutex_init(&swhash->hlist_mutex);  		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + +		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); +		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));  	}  } -static void perf_event_init_cpu(int cpu) +int perf_event_init_cpu(unsigned int cpu)  {  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10247,6 +10370,7 @@ static void perf_event_init_cpu(int cpu)  		rcu_assign_pointer(swhash->swevent_hlist, hlist);  	}  	mutex_unlock(&swhash->hlist_mutex); +	return 0;  }  #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10278,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu)  	}  	srcu_read_unlock(&pmus_srcu, idx);  } +#else + +static void perf_event_exit_cpu_context(int cpu) { } + +#endif -static void perf_event_exit_cpu(int cpu) +int perf_event_exit_cpu(unsigned int cpu)  {  	perf_event_exit_cpu_context(cpu); +	return 0;  } -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif  static int  perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -10307,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = {  	.priority = INT_MIN,  }; -static int -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ -	unsigned int cpu = (long)hcpu; - -	switch (action & ~CPU_TASKS_FROZEN) { - -	case CPU_UP_PREPARE: -		/* -		 * This must be done before the CPU comes alive, because the -		 * moment we can run tasks we can encounter (software) events. -		 * -		 * Specifically, someone can have inherited events on kthreadd -		 * or a pre-existing worker thread that gets re-bound. -		 */ -		perf_event_init_cpu(cpu); -		break; - -	case CPU_DOWN_PREPARE: -		/* -		 * This must be done before the CPU dies because after that an -		 * active event might want to IPI the CPU and that'll not work -		 * so great for dead CPUs. 
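
attr.sample_max_stack, used above both to size the recorded callchain and as the value defaulted from sysctl_perf_event_max_stack in the syscall, is supplied by userspace when opening the event. A hedged sketch with error handling trimmed; it assumes a perf_event.h header new enough to carry the field:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 32;	/* per-event cap; 0 means "use the sysctl" */

	/* get_callchain_buffers() above reports a request larger than
	 * kernel.perf_event_max_stack as -EOVERFLOW instead of clamping it. */
	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0);
	if (fd < 0)
		return 1;
	close(fd);
	return 0;
}
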
-		 * -		 * XXX smp_call_function_single() return -ENXIO without a warn -		 * so we could possibly deal with this. -		 * -		 * This is safe against new events arriving because -		 * sys_perf_event_open() serializes against hotplug using -		 * get_online_cpus(). -		 */ -		perf_event_exit_cpu(cpu); -		break; -	default: -		break; -	} - -	return NOTIFY_OK; -} -  void __init perf_event_init(void)  {  	int ret; @@ -10359,7 +10446,7 @@ void __init perf_event_init(void)  	perf_pmu_register(&perf_cpu_clock, NULL, -1);  	perf_pmu_register(&perf_task_clock, NULL, -1);  	perf_tp_register(); -	perf_cpu_notifier(perf_cpu_notify); +	perf_event_init_cpu(smp_processor_id());  	register_reboot_notifier(&perf_reboot_notifier);  	ret = init_hw_breakpoint(); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)  	return rb->aux_nr_pages << PAGE_SHIFT;  } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\ -static inline unsigned long						\ -func_name(struct perf_output_handle *handle,				\ -	  const void *buf, unsigned long len)				\ +#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)	\  {									\  	unsigned long size, written;					\  									\  	do {								\  		size    = min(handle->size, len);			\ -		written = memcpy_func(handle->addr, buf, size);		\ +		written = memcpy_func(__VA_ARGS__);			\  		written = size - written;				\  									\  		len -= written;						\  		handle->addr += written;				\ -		buf += written;						\ +		if (advance_buf)					\ +			buf += written;					\  		handle->size -= written;				\  		if (!handle->size) {					\  			struct ring_buffer *rb = handle->rb;		\ @@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle,				\  	return len;							\  } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\ +static inline unsigned long						\ +func_name(struct perf_output_handle *handle,				\ +	  const void *buf, unsigned long len)				\ +__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, +		const void *buf, unsigned long len) +{ +	unsigned long orig_len = len; +	__DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, +				  orig_len - len, size) +} +  static inline unsigned long  memcpy_common(void *dst, const void *src, unsigned long n)  { diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..84ae830234f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -211,6 +211,82 @@ repeat:  }  /* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ +	struct sighand_struct *sighand; +	struct task_struct *task; + +	/* +	 * We need to verify that release_task() was not called and thus +	 * delayed_put_task_struct() can't run and drop the last reference +	 * before rcu_read_unlock(). We check task->sighand != NULL, +	 * but we can read the already freed and reused memory. +	 */ +retry: +	task = rcu_dereference(*ptask); +	if (!task) +		return NULL; + +	probe_kernel_address(&task->sighand, sighand); + +	/* +	 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task +	 * was already freed we can not miss the preceding update of this +	 * pointer. 
+	 */ +	smp_rmb(); +	if (unlikely(task != READ_ONCE(*ptask))) +		goto retry; + +	/* +	 * We've re-checked that "task == *ptask", now we have two different +	 * cases: +	 * +	 * 1. This is actually the same task/task_struct. In this case +	 *    sighand != NULL tells us it is still alive. +	 * +	 * 2. This is another task which got the same memory for task_struct. +	 *    We can't know this of course, and we can not trust +	 *    sighand != NULL. +	 * +	 *    In this case we actually return a random value, but this is +	 *    correct. +	 * +	 *    If we return NULL - we can pretend that we actually noticed that +	 *    *ptask was updated when the previous task has exited. Or pretend +	 *    that probe_slab_address(&sighand) reads NULL. +	 * +	 *    If we return the new task (because sighand is not NULL for any +	 *    reason) - this is fine too. This (new) task can't go away before +	 *    another gp pass. +	 * +	 *    And note: We could even eliminate the false positive if re-read +	 *    task->sighand once again to avoid the falsely NULL. But this case +	 *    is very unlikely so we don't care. +	 */ +	if (!sighand) +		return NULL; + +	return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ +	struct task_struct *task; + +	rcu_read_lock(); +	task = task_rcu_dereference(ptask); +	if (task) +		get_task_struct(task); +	rcu_read_unlock(); + +	return task; +} + +/*   * Determine if a process group is "orphaned", according to the POSIX   * definition in 2.2.2.52.  Orphaned process groups are not to be affected   * by terminal-generated stop signals.  Newly orphaned process groups are @@ -700,10 +776,14 @@ void do_exit(long code)  	exit_signals(tsk);  /* sets PF_EXITING */  	/* -	 * tsk->flags are checked in the futex code to protect against -	 * an exiting task cleaning up the robust pi futexes. +	 * Ensure that all new tsk->pi_lock acquisitions must observe +	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().  	 */  	smp_mb(); +	/* +	 * Ensure that we must observe the pi_state in exit_mm() -> +	 * mm_release() -> exit_pi_state_list(). +	 */  	raw_spin_unlock_wait(&tsk->pi_lock);  	if (unlikely(in_atomic())) { diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk)  }  #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack)  {  } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR  /*   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a   * kmemcache based allocator.   */  # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,  						  int node)  { -	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, -						  THREAD_SIZE_ORDER); - -	if (page) -		memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -					    1 << THREAD_SIZE_ORDER); +	struct page *page = alloc_pages_node(node, THREADINFO_GFP, +					     THREAD_SIZE_ORDER);  	return page ? 
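
try_get_task_struct(), introduced above, wraps task_rcu_dereference() in an RCU read-side section and takes a reference when the task is still alive. A hedged consumer sketch; 'watched_task' and the function are invented for illustration:

#include <linux/printk.h>
#include <linux/sched.h>

static struct task_struct *watched_task;	/* updated elsewhere */

static void example_report_watched(void)
{
	struct task_struct *p;

	p = try_get_task_struct(&watched_task);
	if (!p)
		return;		/* task already exited and was released */

	/* Holding a reference, p cannot be freed underneath us. */
	pr_info("watched task: pid %d\n", task_pid_nr(p));
	put_task_struct(p);
}
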
page_address(page) : NULL;  } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack)  { -	struct page *page = virt_to_page(ti); - -	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -				    -(1 << THREAD_SIZE_ORDER)); -	__free_kmem_pages(page, THREAD_SIZE_ORDER); +	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);  }  # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,  						  int node)  { -	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); +	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);  } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack)  { -	kmem_cache_free(thread_info_cache, ti); +	kmem_cache_free(thread_stack_cache, stack);  } -void thread_info_cache_init(void) +void thread_stack_cache_init(void)  { -	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, +	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,  					      THREAD_SIZE, 0, NULL); -	BUG_ON(thread_info_cache == NULL); +	BUG_ON(thread_stack_cache == NULL);  }  # endif  #endif @@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep;  /* SLAB cache for mm_struct structures (tsk->mm) */  static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account)  { -	struct zone *zone = page_zone(virt_to_page(ti)); +	/* All stack pages are in the same zone and belong to the same memcg. */ +	struct page *first_page = virt_to_page(stack); + +	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, +			    THREAD_SIZE / 1024 * account); -	mod_zone_page_state(zone, NR_KERNEL_STACK, account); +	memcg_kmem_update_page_stat( +		first_page, MEMCG_KERNEL_STACK_KB, +		account * (THREAD_SIZE / 1024));  }  void free_task(struct task_struct *tsk)  {  	account_kernel_stack(tsk->stack, -1); -	arch_release_thread_info(tsk->stack); -	free_thread_info(tsk->stack); +	arch_release_thread_stack(tsk->stack); +	free_thread_stack(tsk->stack);  	rt_mutex_debug_task_free(tsk);  	ftrace_graph_exit_task(tsk);  	put_seccomp_filter(tsk); @@ -343,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  {  	struct task_struct *tsk; -	struct thread_info *ti; +	unsigned long *stack;  	int err;  	if (node == NUMA_NO_NODE) @@ -352,15 +350,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	if (!tsk)  		return NULL; -	ti = alloc_thread_info_node(tsk, node); -	if (!ti) +	stack = alloc_thread_stack_node(tsk, node); +	if (!stack)  		goto free_tsk;  	err = arch_dup_task_struct(tsk, orig);  	if (err) -		goto free_ti; +		goto free_stack; -	tsk->stack = ti; +	tsk->stack = stack;  #ifdef CONFIG_SECCOMP  	/*  	 * We must handle setting up seccomp filters once we're under @@ -392,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	tsk->task_frag.page = NULL;  	tsk->wake_q.next = NULL; -	account_kernel_stack(ti, 1); +	account_kernel_stack(stack, 1);  	kcov_task_init(tsk);  	return tsk; -free_ti: -	free_thread_info(ti); +free_stack: +	free_thread_stack(stack);  free_tsk:  	free_task_struct(tsk);  	return 
NULL; diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)  	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))  		return false; -	if (test_thread_flag(TIF_MEMDIE)) +	if (test_tsk_thread_flag(p, TIF_MEMDIE))  		return false;  	if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@  #include <linux/vmalloc.h>  #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)  #define GCOV_COUNTERS			10  #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9  #define GCOV_COUNTERS			9 diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o  obj-$(CONFIG_PM_SLEEP) += pm.o  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_SMP) += affinity.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f --- /dev/null +++ b/kernel/irq/affinity.c @@ -0,0 +1,61 @@ + +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> + +static int get_first_sibling(unsigned int cpu) +{ +	unsigned int ret; + +	ret = cpumask_first(topology_sibling_cpumask(cpu)); +	if (ret < nr_cpu_ids) +		return ret; +	return cpu; +} + +/* + * Take a map of online CPUs and the number of available interrupt vectors + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors + * so that they are distributed as good as possible around the CPUs.  If + * more vectors than CPUs are available we'll map one to each CPU, + * otherwise we map one to the first sibling of each socket. + * + * If there are more vectors than CPUs we will still only have one bit + * set per CPU, but interrupt code will keep on assigning the vectors from + * the start of the bitmap until we run out of vectors. + */ +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ +	struct cpumask *affinity_mask; +	unsigned int max_vecs = *nr_vecs; + +	if (max_vecs == 1) +		return NULL; + +	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); +	if (!affinity_mask) { +		*nr_vecs = 1; +		return NULL; +	} + +	if (max_vecs >= num_online_cpus()) { +		cpumask_copy(affinity_mask, cpu_online_mask); +		*nr_vecs = num_online_cpus(); +	} else { +		unsigned int vecs = 0, cpu; + +		for_each_online_cpu(cpu) { +			if (cpu == get_first_sibling(cpu)) { +				cpumask_set_cpu(cpu, affinity_mask); +				vecs++; +			} + +			if (--max_vecs == 0) +				break; +		} +		*nr_vecs = vecs; +	} + +	return affinity_mask; +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -426,6 +426,49 @@ out_unlock:  }  EXPORT_SYMBOL_GPL(handle_simple_irq); +/** + *	handle_untracked_irq - Simple and software-decoded IRQs. + *	@desc:	the interrupt description structure for this irq + * + *	Untracked interrupts are sent from a demultiplexing interrupt + *	handler when the demultiplexer does not know which device it its + *	multiplexed irq domain generated the interrupt. 
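
irq_create_affinity_mask() above spreads up to *nr_vecs MSI/MSI-X vectors over the online CPUs: one bit per CPU when there are enough vectors, otherwise one bit per first hyperthread sibling, stopping once the vector budget is exhausted. A userspace model of that policy with an invented 8-CPU, 2-way SMT topology:

#include <stdio.h>

#define NCPUS 8
/* first_sibling[c]: lowest-numbered hyperthread sharing c's core
 * (invented topology for illustration). */
static const int first_sibling[NCPUS] = { 0, 0, 2, 2, 4, 4, 6, 6 };

int main(void)
{
	unsigned int nvecs = 3;		/* fewer vectors than CPUs */
	unsigned int used = 0, left = nvecs;
	int mask[NCPUS] = { 0 };

	if (nvecs >= NCPUS) {
		for (int c = 0; c < NCPUS; c++)
			mask[c] = 1;
		used = NCPUS;
	} else {
		for (int c = 0; c < NCPUS && left; c++, left--) {
			if (c == first_sibling[c]) {	/* one bit per core */
				mask[c] = 1;
				used++;
			}
		}
	}

	printf("%u vector(s) spread to:", used);
	for (int c = 0; c < NCPUS; c++)
		if (mask[c])
			printf(" cpu%d", c);
	printf("\n");
	return 0;
}
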
IRQ's handled + *	through here are not subjected to stats tracking, randomness, or + *	spurious interrupt detection. + * + *	Note: Like handle_simple_irq, the caller is expected to handle + *	the ack, clear, mask and unmask issues if necessary. + */ +void handle_untracked_irq(struct irq_desc *desc) +{ +	unsigned int flags = 0; + +	raw_spin_lock(&desc->lock); + +	if (!irq_may_run(desc)) +		goto out_unlock; + +	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + +	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { +		desc->istate |= IRQS_PENDING; +		goto out_unlock; +	} + +	desc->istate &= ~IRQS_PENDING; +	irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); +	raw_spin_unlock(&desc->lock); + +	__handle_irq_event_percpu(desc, &flags); + +	raw_spin_lock(&desc->lock); +	irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: +	raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_untracked_irq); +  /*   * Called unconditionally from handle_level_irq() and only for oneshot   * interrupts from handle_fasteoi_irq() @@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)  	return 0;  } + +/** + * irq_chip_pm_get - Enable power for an IRQ chip + * @data:	Pointer to interrupt specific data + * + * Enable the power to the IRQ chip referenced by the interrupt data + * structure. + */ +int irq_chip_pm_get(struct irq_data *data) +{ +	int retval; + +	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { +		retval = pm_runtime_get_sync(data->chip->parent_device); +		if (retval < 0) { +			pm_runtime_put_noidle(data->chip->parent_device); +			return retval; +		} +	} + +	return 0; +} + +/** + * irq_chip_pm_put - Disable power for an IRQ chip + * @data:	Pointer to interrupt specific data + * + * Disable the power to the IRQ chip referenced by the interrupt data + * structure, belongs. Note that power will only be disabled, once this + * function has been called for all IRQs that have called irq_chip_pm_get(). + */ +int irq_chip_pm_put(struct irq_data *data) +{ +	int retval = 0; + +	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) +		retval = pm_runtime_put(data->chip->parent_device); + +	return (retval < 0) ? 
retval : 0; +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)  	wake_up_process(action->thread);  } -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)  {  	irqreturn_t retval = IRQ_NONE; -	unsigned int flags = 0, irq = desc->irq_data.irq; +	unsigned int irq = desc->irq_data.irq;  	struct irqaction *action;  	for_each_action_of_desc(desc, action) { @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)  			/* Fall through to add to randomness */  		case IRQ_HANDLED: -			flags |= action->flags; +			*flags |= action->flags;  			break;  		default: @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)  		retval |= res;  	} -	add_interrupt_randomness(irq, flags); +	return retval; +} + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +{ +	irqreturn_t retval; +	unsigned int flags = 0; + +	retval = __handle_irq_event_percpu(desc, &flags); + +	add_interrupt_randomness(desc->irq_data.irq, flags);  	if (!noirqdebug)  		note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..bc226e783bd2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -7,6 +7,7 @@   */  #include <linux/irqdesc.h>  #include <linux/kernel_stat.h> +#include <linux/pm_runtime.h>  #ifdef CONFIG_SPARSE_IRQ  # define IRQ_BITMAP_BITS	(NR_IRQS + 8196) @@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);  extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);  irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,  					   struct irqaction *action) { }  #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); +  extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);  extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,  		}  	} -	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); +	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);  	if (virq <= 0) {  		pr_warn("Can't reserve IPI, failed to alloc descs\n");  		return -ENOMEM;  	}  	virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, -				       (void *) dest, true); +				       (void *) dest, true, NULL);  	if (virq <= 0) {  		pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)  	return 0;  } -static void desc_smp_init(struct irq_desc *desc, int node) +static void desc_smp_init(struct irq_desc *desc, int node, +			  const struct cpumask *affinity)  { -	cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); +	if (!affinity) +		affinity = irq_default_affinity; +	cpumask_copy(desc->irq_common_data.affinity, affinity); 
+  #ifdef CONFIG_GENERIC_PENDING_IRQ  	cpumask_clear(desc->pending_mask);  #endif @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)  #else  static inline int  alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline void +desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }  #endif  static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, -		struct module *owner) +			      const struct cpumask *affinity, struct module *owner)  {  	int cpu; @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,  	desc->owner = owner;  	for_each_possible_cpu(cpu)  		*per_cpu_ptr(desc->kstat_irqs, cpu) = 0; -	desc_smp_init(desc, node); +	desc_smp_init(desc, node, affinity);  }  int nr_irqs = NR_IRQS; @@ -158,7 +163,9 @@ void irq_unlock_sparse(void)  	mutex_unlock(&sparse_irq_lock);  } -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) +static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, +				   const struct cpumask *affinity, +				   struct module *owner)  {  	struct irq_desc *desc;  	gfp_t gfp = GFP_KERNEL; @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)  	lockdep_set_class(&desc->lock, &irq_desc_lock_class);  	init_rcu_head(&desc->rcu); -	desc_set_defaults(irq, desc, node, owner); +	desc_set_defaults(irq, desc, node, affinity, owner); +	irqd_set(&desc->irq_data, flags);  	return desc; @@ -223,13 +231,32 @@ static void free_desc(unsigned int irq)  }  static int alloc_descs(unsigned int start, unsigned int cnt, int node, -		       struct module *owner) +		       const struct cpumask *affinity, struct module *owner)  { +	const struct cpumask *mask = NULL;  	struct irq_desc *desc; -	int i; +	unsigned int flags; +	int i, cpu = -1; + +	if (affinity && cpumask_empty(affinity)) +		return -EINVAL; + +	flags = affinity ? IRQD_AFFINITY_MANAGED : 0;  	for (i = 0; i < cnt; i++) { -		desc = alloc_desc(start + i, node, owner); +		if (affinity) { +			cpu = cpumask_next(cpu, affinity); +			if (cpu >= nr_cpu_ids) +				cpu = cpumask_first(affinity); +			node = cpu_to_node(cpu); + +			/* +			 * For single allocations we use the caller provided +			 * mask otherwise we use the mask of the target cpu +			 */ +			mask = cnt == 1 ? 
affinity : cpumask_of(cpu); +		} +		desc = alloc_desc(start + i, node, flags, mask, owner);  		if (!desc)  			goto err;  		mutex_lock(&sparse_irq_lock); @@ -277,7 +304,7 @@ int __init early_irq_init(void)  		nr_irqs = initcnt;  	for (i = 0; i < initcnt; i++) { -		desc = alloc_desc(i, node, NULL); +		desc = alloc_desc(i, node, 0, NULL, NULL);  		set_bit(i, allocated_irqs);  		irq_insert_desc(i, desc);  	} @@ -311,7 +338,7 @@ int __init early_irq_init(void)  		alloc_masks(&desc[i], GFP_KERNEL, node);  		raw_spin_lock_init(&desc[i].lock);  		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); -		desc_set_defaults(i, &desc[i], node, NULL); +		desc_set_defaults(i, &desc[i], node, NULL, NULL);  	}  	return arch_early_irq_init();  } @@ -328,11 +355,12 @@ static void free_desc(unsigned int irq)  	unsigned long flags;  	raw_spin_lock_irqsave(&desc->lock, flags); -	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); +	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);  	raw_spin_unlock_irqrestore(&desc->lock, flags);  }  static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, +			      const struct cpumask *affinity,  			      struct module *owner)  {  	u32 i; @@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);   * @cnt:	Number of consecutive irqs to allocate.   * @node:	Preferred node on which the irq descriptor should be allocated   * @owner:	Owning module (can be NULL) + * @affinity:	Optional pointer to an affinity mask which hints where the + *		irq descriptors should be allocated and which default + *		affinities to use   *   * Returns the first irq number or error code   */  int __ref  __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, -		  struct module *owner) +		  struct module *owner, const struct cpumask *affinity)  {  	int start, ret; @@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,  	bitmap_set(allocated_irqs, start, cnt);  	mutex_unlock(&sparse_irq_lock); -	return alloc_descs(start, cnt, node, owner); +	return alloc_descs(start, cnt, node, affinity, owner);  err:  	mutex_unlock(&sparse_irq_lock); @@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);   */  unsigned int irq_alloc_hwirqs(int cnt, int node)  { -	int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); +	int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);  	if (irq < 0)  		return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,  	}  	/* Allocate a virtual interrupt number */ -	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); +	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);  	if (virq <= 0) {  		pr_debug("-> virq allocation failed\n");  		return 0; @@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,  unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  {  	struct irq_domain *domain; +	struct irq_data *irq_data;  	irq_hw_number_t hwirq;  	unsigned int type = IRQ_TYPE_NONE;  	int virq; @@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  	if (irq_domain_translate(domain, fwspec, &hwirq, &type))  		return 0; -	if (irq_domain_is_hierarchy(domain)) { +	/* +	 * WARN if the irqchip returns a type with bits +	 * outside the sense mask set and clear these bits. 
+	 */ +	if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) +		type &= IRQ_TYPE_SENSE_MASK; + +	/* +	 * If we've already configured this interrupt, +	 * don't do it again, or hell will break loose. +	 */ +	virq = irq_find_mapping(domain, hwirq); +	if (virq) { +		/* +		 * If the trigger type is not specified or matches the +		 * current trigger type then we are done so return the +		 * interrupt number. +		 */ +		if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) +			return virq; +  		/* -		 * If we've already configured this interrupt, -		 * don't do it again, or hell will break loose. +		 * If the trigger type has not been set yet, then set +		 * it now and return the interrupt number.  		 */ -		virq = irq_find_mapping(domain, hwirq); -		if (virq) +		if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { +			irq_data = irq_get_irq_data(virq); +			if (!irq_data) +				return 0; + +			irqd_set_trigger_type(irq_data, type);  			return virq; +		} +		pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", +			hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); +		return 0; +	} + +	if (irq_domain_is_hierarchy(domain)) {  		virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);  		if (virq <= 0)  			return 0; @@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  			return virq;  	} -	/* Set type if specified and different than the current one */ -	if (type != IRQ_TYPE_NONE && -	    type != irq_get_trigger_type(virq)) -		irq_set_irq_type(virq, type); +	irq_data = irq_get_irq_data(virq); +	if (!irq_data) { +		if (irq_domain_is_hierarchy(domain)) +			irq_domain_free_irqs(virq, 1); +		else +			irq_dispose_mapping(virq); +		return 0; +	} + +	/* Store trigger type */ +	irqd_set_trigger_type(irq_data, type); +  	return virq;  }  EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); @@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)  	if (WARN_ON(domain == NULL))  		return; -	irq_domain_disassociate(domain, virq); -	irq_free_desc(virq); +	if (irq_domain_is_hierarchy(domain)) { +		irq_domain_free_irqs(virq, 1); +	} else { +		irq_domain_disassociate(domain, virq); +		irq_free_desc(virq); +	}  }  EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {  EXPORT_SYMBOL_GPL(irq_domain_simple_ops);  int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, -			   int node) +			   int node, const struct cpumask *affinity)  {  	unsigned int hint;  	if (virq >= 0) { -		virq = irq_alloc_descs(virq, virq, cnt, node); +		virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, +					 affinity);  	} else {  		hint = hwirq % nr_irqs;  		if (hint == 0)  			hint++; -		virq = irq_alloc_descs_from(hint, cnt, node); -		if (virq <= 0 && hint > 1) -			virq = irq_alloc_descs_from(1, cnt, node); +		virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, +					 affinity); +		if (virq <= 0 && hint > 1) { +			virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, +						 affinity); +		}  	}  	return virq; @@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,  	if (recursive)  		ret = irq_domain_alloc_irqs_recursive(parent, irq_base,  						      nr_irqs, arg); -	if (ret >= 0) -		ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); +	if (ret < 0) +		return ret; + +	ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);  	if (ret < 0 && recursive)  		irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); @@ -1160,6 +1210,7 @@ int 
irq_domain_alloc_irqs_recursive(struct irq_domain *domain,   * @node:	NUMA node id for memory allocation   * @arg:	domain specific argument   * @realloc:	IRQ descriptors have already been allocated if true + * @affinity:	Optional irq affinity mask for multiqueue devices   *   * Allocate IRQ numbers and initialized all data structures to support   * hierarchy IRQ domains. @@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,   */  int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,  			    unsigned int nr_irqs, int node, void *arg, -			    bool realloc) +			    bool realloc, const struct cpumask *affinity)  {  	int i, ret, virq; @@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,  	if (realloc && irq_base >= 0) {  		virq = irq_base;  	} else { -		virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); +		virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, +					      affinity);  		if (virq < 0) {  			pr_debug("cannot allocate IRQ(base %d, count %d)\n",  				 irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..73a2b786b5e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);  #ifdef CONFIG_SMP  cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc)  {  	if (!desc || !irqd_can_balance(&desc->irq_data) ||  	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) -		return 0; -	return 1; +		return false; +	return true;  }  /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)  }  /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq:	Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	return __irq_can_set_affinity(desc) && +		!irqd_affinity_is_managed(&desc->irq_data); +} + +/**   *	irq_set_thread_affinity - Notify irq threads to adjust affinity   *	@desc:		irq descriptor which has affitnity changed   * @@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)  		return 0;  	/* -	 * Preserve an userspace affinity setup, but make sure that -	 * one of the targets is online. +	 * Preserve the managed affinity setting and an userspace affinity +	 * setup, but make sure that one of the targets is online.  	 */ -	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { +	if (irqd_affinity_is_managed(&desc->irq_data) || +	    irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {  		if (cpumask_intersects(desc->irq_common_data.affinity,  				       cpu_online_mask))  			set = desc->irq_common_data.affinity; @@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	new->irq = irq;  	/* +	 * If the trigger type is not specified by the caller, +	 * then use the default for this interrupt. +	 */ +	if (!(new->flags & IRQF_TRIGGER_MASK)) +		new->flags |= irqd_get_trigger_type(&desc->irq_data); + +	/*  	 * Check whether the interrupt nests into another interrupt  	 * thread.  	 
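
With the __setup_irq() change above, a request that carries no IRQF_TRIGGER_* bits inherits the trigger type already recorded for the descriptor, for example the one taken from the firmware description. A hedged driver-style sketch with invented names:

#include <linux/interrupt.h>

static irqreturn_t example_isr(int irq, void *dev_id)
{
	/* ... acknowledge the device ... */
	return IRQ_HANDLED;
}

static int example_request(int irq, void *dev)
{
	/* No trigger flags on purpose: __setup_irq() falls back to the
	 * trigger type stored in the irq descriptor. */
	return request_irq(irq, example_isr, 0, "example-dev", dev);
}
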
*/ @@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)  	if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))  		return -EINVAL; + +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		return retval; +  	chip_bus_lock(desc);  	retval = __setup_irq(irq, desc, act);  	chip_bus_sync_unlock(desc); +	if (retval) +		irq_chip_pm_put(&desc->irq_data); +  	return retval;  }  EXPORT_SYMBOL_GPL(setup_irq); @@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  		}  	} +	irq_chip_pm_put(&desc->irq_data);  	module_put(desc->owner);  	kfree(action->secondary);  	return action; @@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,  	action->name = devname;  	action->dev_id = dev_id; +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		return retval; +  	chip_bus_lock(desc);  	retval = __setup_irq(irq, desc, action);  	chip_bus_sync_unlock(desc);  	if (retval) { +		irq_chip_pm_put(&desc->irq_data);  		kfree(action->secondary);  		kfree(action);  	} @@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)  	if (!desc)  		return; +	/* +	 * If the trigger type is not specified by the caller, then +	 * use the default for this interrupt. +	 */  	type &= IRQ_TYPE_SENSE_MASK; +	if (type == IRQ_TYPE_NONE) +		type = irqd_get_trigger_type(&desc->irq_data); +  	if (type != IRQ_TYPE_NONE) {  		int ret; @@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_  	unregister_handler_proc(irq, action); +	irq_chip_pm_put(&desc->irq_data);  	module_put(desc->owner);  	return action; @@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)  	if (!desc || !irq_settings_is_per_cpu_devid(desc))  		return -EINVAL; + +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		return retval; +  	chip_bus_lock(desc);  	retval = __setup_irq(irq, desc, act);  	chip_bus_sync_unlock(desc); +	if (retval) +		irq_chip_pm_put(&desc->irq_data); +  	return retval;  } @@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,  	action->name = devname;  	action->percpu_dev_id = dev_id; +	retval = irq_chip_pm_get(&desc->irq_data); +	if (retval < 0) +		return retval; +  	chip_bus_lock(desc);  	retval = __setup_irq(irq, desc, action);  	chip_bus_sync_unlock(desc); -	if (retval) +	if (retval) { +		irq_chip_pm_put(&desc->irq_data);  		kfree(action); +	}  	return retval;  } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,  	struct msi_domain_ops *ops = info->ops;  	msi_alloc_info_t arg;  	struct msi_desc *desc; -	int i, ret, virq = -1; +	int i, ret, virq;  	ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);  	if (ret) @@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,  	for_each_msi_entry(desc, dev) {  		ops->set_desc(&arg, desc); -		if (info->flags & MSI_FLAG_IDENTITY_MAP) -			virq = (int)ops->get_hwirq(info, &arg); -		else -			virq = -1; -		virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, -					       dev_to_node(dev), &arg, false); +		virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, +					       dev_to_node(dev), &arg, false, +					       desc->affinity);  		if (virq < 0) {  			ret = -ENOSPC;  			if (ops->handle_error) @@ -356,6 +353,7 
@@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,  		ops->msi_finish(&arg, 0);  	for_each_msi_entry(desc, dev) { +		virq = desc->irq;  		if (desc->nvec_used == 1)  			dev_dbg(dev, "irq %d for MSI\n", virq);  		else diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..feaa813b84a9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,  	cpumask_var_t new_value;  	int err; -	if (!irq_can_set_affinity(irq) || no_irq_affinity) +	if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)  		return -EIO;  	if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)  					!name_unique(irq, action))  		return; -	memset(name, 0, MAX_NAMELEN);  	snprintf(name, MAX_NAMELEN, "%s", action->name);  	/* create /proc/irq/1234/handler/ */ @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)  	if (desc->dir)  		goto out_unlock; -	memset(name, 0, MAX_NAMELEN);  	sprintf(name, "%d", irq);  	/* create /proc/irq/1234 */ @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)  #endif  	remove_proc_entry("spurious", desc->dir); -	memset(name, 0, MAX_NAMELEN);  	sprintf(name, "%u", irq);  	remove_proc_entry(name, root_irq_dir);  } @@ -421,12 +418,8 @@ void init_irq_proc(void)  	/*  	 * Create entries for all existing IRQs.  	 */ -	for_each_irq_desc(irq, desc) { -		if (!desc) -			continue; - +	for_each_irq_desc(irq, desc)  		register_irq_proc(irq, desc); -	}  }  #ifdef CONFIG_GENERIC_IRQ_SHOW diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);  void static_key_slow_inc(struct static_key *key)  { +	int v, v1; +  	STATIC_KEY_CHECK_USE(); -	if (atomic_inc_not_zero(&key->enabled)) -		return; + +	/* +	 * Careful if we get concurrent static_key_slow_inc() calls; +	 * later calls must wait for the first one to _finish_ the +	 * jump_label_update() process.  At the same time, however, +	 * the jump_label_update() call below wants to see +	 * static_key_enabled(&key) for jumps to be updated properly. +	 * +	 * So give a special meaning to negative key->enabled: it sends +	 * static_key_slow_inc() down the slow path, and it is non-zero +	 * so it counts as "enabled" in jump_label_update().  Note that +	 * atomic_inc_unless_negative() checks >= 0, so roll our own. +	 */ +	for (v = atomic_read(&key->enabled); v > 0; v = v1) { +		v1 = atomic_cmpxchg(&key->enabled, v, v + 1); +		if (likely(v1 == v)) +			return; +	}  	jump_label_lock(); -	if (atomic_inc_return(&key->enabled) == 1) +	if (atomic_read(&key->enabled) == 0) { +		atomic_set(&key->enabled, -1);  		jump_label_update(key); +		atomic_set(&key->enabled, 1); +	} else { +		atomic_inc(&key->enabled); +	}  	jump_label_unlock();  }  EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);  static void __static_key_slow_dec(struct static_key *key,  		unsigned long rate_limit, struct delayed_work *work)  { +	/* +	 * The negative count check is valid even when a negative +	 * key->enabled is in use by static_key_slow_inc(); a +	 * __static_key_slow_dec() before the first static_key_slow_inc() +	 * returns is unbalanced, because all other static_key_slow_inc() +	 * instances block while the update is in progress. 
+	 */  	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {  		WARN(atomic_read(&key->enabled) < 0,  		     "jump label: negative count!\n"); @@ -422,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,  	return notifier_from_errno(ret);  } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = {  	.notifier_call = jump_label_module_notify,  	.priority = 1, /* higher than tracepoints */  }; diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {  static int __init kcov_init(void)  { -	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { +	/* +	 * The kcov debugfs file won't ever get removed and thus, +	 * there is no need to protect it against removal races. The +	 * use of debugfs_create_file_unsafe() is actually safe here. +	 */ +	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {  		pr_err("failed to create kcov in debugfs\n");  		return -ENOMEM;  	} diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@  #include <linux/gfp.h>  #include <linux/kmemcheck.h>  #include <linux/random.h> +#include <linux/jhash.h>  #include <asm/sections.h> @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];   * It's a 64-bit hash, because it's important for the keys to be   * unique.   */ -#define iterate_chain_key(key1, key2) \ -	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ -	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ -	(key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ +	u32 k0 = key, k1 = key >> 32; + +	__jhash_mix(idx, k0, k1); /* Macro that modifies arguments! 
*/ + +	return k0 | (u64)k1 << 32; +}  void lockdep_off(void)  { diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)  }  void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, -			    struct thread_info *ti) +			    struct task_struct *task)  {  	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));  	/* Mark the current thread as blocked on the lock: */ -	ti->task->blocked_on = waiter; +	task->blocked_on = waiter;  }  void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, -			 struct thread_info *ti) +			 struct task_struct *task)  {  	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); -	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); -	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); -	ti->task->blocked_on = NULL; +	DEBUG_LOCKS_WARN_ON(waiter->task != task); +	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); +	task->blocked_on = NULL;  	list_del_init(&waiter->list);  	waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,  extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);  extern void debug_mutex_add_waiter(struct mutex *lock,  				   struct mutex_waiter *waiter, -				   struct thread_info *ti); +				   struct task_struct *task);  extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, -				struct thread_info *ti); +				struct task_struct *task);  extern void debug_mutex_unlock(struct mutex *lock);  extern void debug_mutex_init(struct mutex *lock, const char *name,  			     struct lock_class_key *key);  static inline void mutex_set_owner(struct mutex *lock)  { -	lock->owner = current; +	WRITE_ONCE(lock->owner, current);  }  static inline void mutex_clear_owner(struct mutex *lock)  { -	lock->owner = NULL; +	WRITE_ONCE(lock->owner, NULL);  }  #define spin_lock_mutex(lock, flags)			\ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 79d2d765a75f..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		goto skip_wait;  	debug_mutex_lock_common(lock, &waiter); -	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); +	debug_mutex_add_waiter(lock, &waiter, task);  	/* add waiting tasks to the end of the waitqueue (FIFO): */  	list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	}  	__set_task_state(task, TASK_RUNNING); -	mutex_remove_waiter(lock, &waiter, current_thread_info()); +	mutex_remove_waiter(lock, &waiter, task);  	/* set it to 0 if there are no waiters left: */  	if (likely(list_empty(&lock->wait_list)))  		atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait:  	return 0;  err: -	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +	mutex_remove_waiter(lock, &waiter, task);  	spin_unlock_mutex(&lock->wait_lock, flags);  	debug_mutex_free_waiter(&waiter);  	mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ 
b/kernel/locking/mutex.h @@ -13,18 +13,24 @@  		do { spin_lock(lock); (void)(flags); } while (0)  #define spin_unlock_mutex(lock, flags) \  		do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \  		__list_del((waiter)->list.prev, (waiter)->list.next)  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */  static inline void mutex_set_owner(struct mutex *lock)  { -	lock->owner = current; +	WRITE_ONCE(lock->owner, current);  }  static inline void mutex_clear_owner(struct mutex *lock)  { -	lock->owner = NULL; +	WRITE_ONCE(lock->owner, NULL);  }  #else  static inline void mutex_set_owner(struct mutex *lock) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)  	 * that accesses can't leak upwards out of our subsequent critical  	 * section in the case that the lock is currently held for write.  	 */ -	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; +	cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);  	rspin_until_writer_unlock(lock, cnts);  	/* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);   * therefore increment the cpu number by one.   */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx)  {  	u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)  	return tail;  } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail)  {  	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;  	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -268,6 +268,63 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,  #endif  /* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + *   CPU0				CPU1 + * + *   global_lock();			local_lock(i) + *     spin_lock(&G)			  spin_lock(&L[i]) + *     for (i)				  if (!spin_is_locked(&G)) { + *       spin_unlock_wait(&L[i]);	    smp_acquire__after_ctrl_dep(); + *					    return; + *					  } + *					  // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. 
+ * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + *   CPU0				CPU1 + * + *   flag = set; + *   smp_mb();				spin_lock(&l) + *   spin_unlock_wait(&l);		if (!flag) + *					  // add to lockless list + *					spin_unlock(&l); + *   // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + +/*   * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before   * issuing an _unordered_ store to set _Q_LOCKED_VAL.   * @@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock)  		cpu_relax();  done: -	smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +	smp_acquire__after_ctrl_dep();  }  EXPORT_SYMBOL(queued_spin_unlock_wait);  #endif @@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * sequentiality; this is because not all clear_pending_set_locked()  	 * implementations imply full barriers.  	 */ -	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); +	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));  	/*  	 * take ownership and clear the pending bit. @@ -455,6 +512,8 @@ queue:  	 * pending stuff.  	 *  	 * p,*,* -> n,*,* +	 * +	 * RELEASE, such that the stores to @node must be complete.  	 */  	old = xchg_tail(lock, tail);  	next = NULL; @@ -465,6 +524,15 @@ queue:  	 */  	if (old & _Q_TAIL_MASK) {  		prev = decode_tail(old); +		/* +		 * The above xchg_tail() is also a load of @lock which generates, +		 * through decode_tail(), a pointer. +		 * +		 * The address dependency matches the RELEASE of xchg_tail() +		 * such that the access to @prev must happen after. +		 */ +		smp_read_barrier_depends(); +  		WRITE_ONCE(prev->next, node);  		pv_wait_node(node, prev); @@ -494,7 +562,7 @@ queue:  	 *  	 * The PV pv_wait_head_or_lock function, if active, will acquire  	 * the lock and return a non-zero value. So we have to skip the -	 * smp_cond_acquire() call. As the next PV queue head hasn't been +	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been  	 * designated yet, there is no way for the locked value to become  	 * _Q_SLOW_VAL. So both the set_locked() and the  	 * atomic_cmpxchg_relaxed() calls will be safe. @@ -505,7 +573,7 @@ queue:  	if ((val = pv_wait_head_or_lock(lock, node)))  		goto locked; -	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); +	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));  locked:  	/* @@ -525,9 +593,9 @@ locked:  			break;  		}  		/* -		 * The smp_cond_acquire() call above has provided the necessary -		 * acquire semantics required for locking. At most two -		 * iterations of this loop may be ran. +		 * The smp_cond_load_acquire() call above has provided the +		 * necessary acquire semantics required for locking. At most +		 * two iterations of this loop may be ran.  		 
*/  		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);  		if (old == val) @@ -551,7 +619,7 @@ release:  	/*  	 * release the node  	 */ -	this_cpu_dec(mcs_nodes[0].count); +	__this_cpu_dec(mcs_nodes[0].count);  }  EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)  #else /* _Q_PENDING_BITS == 8 */  static __always_inline void set_pending(struct qspinlock *lock)  { -	atomic_set_mask(_Q_PENDING_VAL, &lock->val); +	atomic_or(_Q_PENDING_VAL, &lock->val);  }  static __always_inline void clear_pending(struct qspinlock *lock)  { -	atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +	atomic_andnot(_Q_PENDING_VAL, &lock->val);  }  static __always_inline int trylock_clear_pending(struct qspinlock *lock) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);   */  int __sched rt_mutex_trylock(struct rt_mutex *lock)  { -	if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) +	if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))  		return 0;  	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,  	debug_check_no_locks_freed((void *)sem, sizeof(*sem));  	lockdep_init_map(&sem->dep_map, name, key, 0);  #endif -	sem->count = RWSEM_UNLOCKED_VALUE; +	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);  	raw_spin_lock_init(&sem->wait_lock);  	INIT_LIST_HEAD(&sem->wait_list);  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -114,12 +114,16 @@ enum rwsem_wake_type {   *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)   *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)   * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + *   to actually wakeup the blocked task(s) and drop the reference count, + *   preferably when the wait_lock is released   * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false   */  static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, +		  enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)  {  	struct rwsem_waiter *waiter;  	struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)  	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);  	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { -		if (wake_type == RWSEM_WAKE_ANY) -			/* Wake writer at the front of the queue, but do not -			 * grant it the lock yet as we want other writers -			 * to be able to steal it.  
Readers, on the other hand, -			 * will block as they will notice the queued writer. +		if (wake_type == RWSEM_WAKE_ANY) { +			/* +			 * Mark writer at the front of the queue for wakeup. +			 * Until the task is actually later awoken later by +			 * the caller, other writers are able to steal it. +			 * Readers, on the other hand, will block as they +			 * will notice the queued writer.  			 */ -			wake_up_process(waiter->task); +			wake_q_add(wake_q, waiter->task); +		}  		goto out;  	} @@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)  	if (wake_type != RWSEM_WAKE_READ_OWNED) {  		adjustment = RWSEM_ACTIVE_READ_BIAS;   try_reader_grant: -		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; +		oldcount = atomic_long_fetch_add(adjustment, &sem->count); +  		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { -			/* A writer stole the lock. Undo our reader grant. */ -			if (rwsem_atomic_update(-adjustment, sem) & -						RWSEM_ACTIVE_MASK) +			/* +			 * If the count is still less than RWSEM_WAITING_BIAS +			 * after removing the adjustment, it is assumed that +			 * a writer has stolen the lock. We have to undo our +			 * reader grant. +			 */ +			if (atomic_long_add_return(-adjustment, &sem->count) < +			    RWSEM_WAITING_BIAS)  				goto out;  			/* Last active locker left. Retry waking readers. */  			goto try_reader_grant;  		} +		/* +		 * It is not really necessary to set it to reader-owned here, +		 * but it gives the spinners an early indication that the +		 * readers now have the lock. +		 */ +		rwsem_set_reader_owned(sem);  	}  	/* Grant an infinite number of read locks to the readers at the front @@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)  		adjustment -= RWSEM_WAITING_BIAS;  	if (adjustment) -		rwsem_atomic_add(adjustment, sem); +		atomic_long_add(adjustment, &sem->count);  	next = sem->wait_list.next;  	loop = woken; @@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)  		waiter = list_entry(next, struct rwsem_waiter, list);  		next = waiter->list.next;  		tsk = waiter->task; + +		wake_q_add(wake_q, tsk);  		/* -		 * Make sure we do not wakeup the next reader before -		 * setting the nil condition to grant the next reader; -		 * otherwise we could miss the wakeup on the other -		 * side and end up sleeping again. See the pairing -		 * in rwsem_down_read_failed(). +		 * Ensure that the last operation is setting the reader +		 * waiter to nil such that rwsem_down_read_failed() cannot +		 * race with do_exit() by always holding a reference count +		 * to the task to wakeup.  		 
*/ -		smp_mb(); -		waiter->task = NULL; -		wake_up_process(tsk); -		put_task_struct(tsk); +		smp_store_release(&waiter->task, NULL);  	} while (--loop);  	sem->wait_list.next = next; @@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)  	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;  	struct rwsem_waiter waiter;  	struct task_struct *tsk = current; +	WAKE_Q(wake_q);  	/* set up my own style of waitqueue */  	waiter.task = tsk;  	waiter.type = RWSEM_WAITING_FOR_READ; -	get_task_struct(tsk);  	raw_spin_lock_irq(&sem->wait_lock);  	if (list_empty(&sem->wait_list)) @@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)  	list_add_tail(&waiter.list, &sem->wait_list);  	/* we're now waiting on the lock, but no longer actively locking */ -	count = rwsem_atomic_update(adjustment, sem); +	count = atomic_long_add_return(adjustment, &sem->count);  	/* If there are no active locks, wake the front queued process(es).  	 * @@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)  	if (count == RWSEM_WAITING_BIAS ||  	    (count > RWSEM_WAITING_BIAS &&  	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) -		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); +		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);  	raw_spin_unlock_irq(&sem->wait_lock); +	wake_up_q(&wake_q);  	/* wait to be given the lock */  	while (true) { @@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)  }  EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */  static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)  {  	/* -	 * Try acquiring the write lock. Check count first in order -	 * to reduce unnecessary expensive cmpxchg() operations. +	 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.  	 */ -	if (count == RWSEM_WAITING_BIAS && -	    cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, -		    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { -		if (!list_is_singular(&sem->wait_list)) -			rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); +	if (count != RWSEM_WAITING_BIAS) +		return false; + +	/* +	 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there +	 * are other tasks on the wait list, we need to add on WAITING_BIAS. +	 */ +	count = list_is_singular(&sem->wait_list) ? 
+			RWSEM_ACTIVE_WRITE_BIAS : +			RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + +	if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) +							== RWSEM_WAITING_BIAS) {  		rwsem_set_owner(sem);  		return true;  	} @@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)   */  static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)  { -	long old, count = READ_ONCE(sem->count); +	long old, count = atomic_long_read(&sem->count);  	while (true) {  		if (!(count == 0 || count == RWSEM_WAITING_BIAS))  			return false; -		old = cmpxchg_acquire(&sem->count, count, +		old = atomic_long_cmpxchg_acquire(&sem->count, count,  				      count + RWSEM_ACTIVE_WRITE_BIAS);  		if (old == count) {  			rwsem_set_owner(sem); @@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)  	rcu_read_lock();  	owner = READ_ONCE(sem->owner); -	if (!owner) { -		long count = READ_ONCE(sem->count); +	if (!rwsem_owner_is_writer(owner)) {  		/* -		 * If sem->owner is not set, yet we have just recently entered the -		 * slowpath with the lock being active, then there is a possibility -		 * reader(s) may have the lock. To be safe, bail spinning in these -		 * situations. +		 * Don't spin if the rwsem is readers owned.  		 */ -		if (count & RWSEM_ACTIVE_MASK) -			ret = false; +		ret = !rwsem_owner_is_reader(owner);  		goto done;  	} @@ -325,10 +350,15 @@ done:  	return ret;  } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)  { -	long count; +	struct task_struct *owner = READ_ONCE(sem->owner); + +	if (!rwsem_owner_is_writer(owner)) +		goto out;  	rcu_read_lock();  	while (sem->owner == owner) { @@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)  		cpu_relax_lowlatency();  	}  	rcu_read_unlock(); - -	if (READ_ONCE(sem->owner)) -		return true; /* new owner, continue spinning */ - +out:  	/* -	 * When the owner is not set, the lock could be free or -	 * held by readers. Check the counter to verify the -	 * state. +	 * If there is a new owner or the owner is not set, we continue +	 * spinning.  	 */ -	count = READ_ONCE(sem->count); -	return (count == 0 || count == RWSEM_WAITING_BIAS); +	return !rwsem_owner_is_reader(READ_ONCE(sem->owner));  }  static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  { -	struct task_struct *owner;  	bool taken = false;  	preempt_disable(); @@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  	if (!osq_lock(&sem->osq))  		goto done; -	while (true) { -		owner = READ_ONCE(sem->owner); -		if (owner && !rwsem_spin_on_owner(sem, owner)) -			break; - -		/* wait_lock will be acquired if write_lock is obtained */ +	/* +	 * Optimistically spin on the owner field and attempt to acquire the +	 * lock whenever the owner changes. Spinning will be stopped when: +	 *  1) the owning writer isn't running; or +	 *  2) readers own the lock as we can't determine if they are +	 *     actively running or not. 
+	 */ +	while (rwsem_spin_on_owner(sem)) { +		/* +		 * Try to acquire the lock +		 */  		if (rwsem_try_write_lock_unqueued(sem)) {  			taken = true;  			break; @@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  		 * we're an RT task that will live-lock because we won't let  		 * the owner complete.  		 */ -		if (!owner && (need_resched() || rt_task(current))) +		if (!sem->owner && (need_resched() || rt_task(current)))  			break;  		/* @@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  	bool waiting = true; /* any queued threads before us */  	struct rwsem_waiter waiter;  	struct rw_semaphore *ret = sem; +	WAKE_Q(wake_q);  	/* undo write bias from down_write operation, stop active locking */ -	count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); +	count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);  	/* do optimistic spinning and steal lock if possible */  	if (rwsem_optimistic_spin(sem)) @@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  	/* we're now waiting on the lock, but no longer actively locking */  	if (waiting) { -		count = READ_ONCE(sem->count); +		count = atomic_long_read(&sem->count);  		/*  		 * If there were already threads queued before us and there are  		 * no active writers, the lock must be read owned; so we try to  		 * wake any read locks that were queued ahead of us.  		 */ -		if (count > RWSEM_WAITING_BIAS) -			sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); +		if (count > RWSEM_WAITING_BIAS) { +			WAKE_Q(wake_q); + +			sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); +			/* +			 * The wakeup is normally called _after_ the wait_lock +			 * is released, but given that we are proactively waking +			 * readers we can deal with the wake_q overhead as it is +			 * similar to releasing and taking the wait_lock again +			 * for attempting rwsem_try_write_lock(). +			 */ +			wake_up_q(&wake_q); +		}  	} else -		count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); +		count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);  	/* wait until we successfully acquire the lock */  	set_current_state(state); @@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  			schedule();  			set_current_state(state); -		} while ((count = sem->count) & RWSEM_ACTIVE_MASK); +		} while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);  		raw_spin_lock_irq(&sem->wait_lock);  	} @@ -507,10 +548,11 @@ out_nolock:  	raw_spin_lock_irq(&sem->wait_lock);  	list_del(&waiter.list);  	if (list_empty(&sem->wait_list)) -		rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); +		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);  	else -		__rwsem_do_wake(sem, RWSEM_WAKE_ANY); +		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);  	raw_spin_unlock_irq(&sem->wait_lock); +	wake_up_q(&wake_q);  	return ERR_PTR(-EINTR);  } @@ -537,6 +579,7 @@ __visible  struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)  {  	unsigned long flags; +	WAKE_Q(wake_q);  	/*  	 * If a spinner is present, it is not necessary to do the wakeup. 
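Side note on the rwsem hunks above and the rwsem_wake()/rwsem_downgrade_wake() hunks that follow: they all switch from waking tasks directly under sem->wait_lock to the deferred wake_q pattern, where waiters are only marked for wakeup with wake_q_add() while the spinlock is held and wake_up_q() performs the actual wakeups after the lock is dropped. A minimal sketch of that pattern, assuming a hypothetical my_lock/my_waiter pair (the struct names and layout are illustrative, not the kernel's rwsem; the wake_q calls are the ones used in the hunks above):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_waiter {
	struct list_head list;
	struct task_struct *task;	/* set to NULL once marked for wakeup */
};

struct my_lock {
	raw_spinlock_t wait_lock;
	struct list_head wait_list;
};

static void my_lock_wake_all(struct my_lock *lock)
{
	struct my_waiter *waiter, *next;
	WAKE_Q(wake_q);			/* on-stack wake queue head */

	raw_spin_lock_irq(&lock->wait_lock);
	list_for_each_entry_safe(waiter, next, &lock->wait_list, list) {
		/*
		 * Only mark the task for wakeup here; wake_q_add() takes its
		 * own reference, so the task cannot go away under us.
		 */
		wake_q_add(&wake_q, waiter->task);
		list_del_init(&waiter->list);
		/* Last access to *waiter: it lives on the waiter's stack. */
		smp_store_release(&waiter->task, NULL);
	}
	raw_spin_unlock_irq(&lock->wait_lock);

	/* The actual (possibly expensive) wakeups happen without wait_lock. */
	wake_up_q(&wake_q);
}

The point of the pattern is that the wakeup work and the task reference drop move out from under the contended wait_lock, which is what the wake_up_q() calls added after the raw_spin_unlock_irq()/irqrestore() sites in these hunks achieve.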
@@ -573,9 +616,10 @@ locked:  	/* do nothing if list empty */  	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); +		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);  	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +	wake_up_q(&wake_q);  	return sem;  } @@ -590,14 +634,16 @@ __visible  struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)  {  	unsigned long flags; +	WAKE_Q(wake_q);  	raw_spin_lock_irqsave(&sem->wait_lock, flags);  	/* do nothing if list empty */  	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); +		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);  	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +	wake_up_q(&wake_q);  	return sem;  } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)  {  	int ret = __down_read_trylock(sem); -	if (ret == 1) +	if (ret == 1) {  		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); +		rwsem_set_reader_owned(sem); +	}  	return ret;  } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)  	 * lockdep: a downgraded write will live on as a write  	 * dependency.  	 */ -	rwsem_clear_owner(sem); +	rwsem_set_reader_owned(sem);  	__downgrade_write(sem);  } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)  	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,14 +1,58 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + *  1) 0 + *     - lock is free or the owner hasn't set the field yet + *  2) RWSEM_READER_OWNED + *     - lock is currently or previously owned by readers (lock is free + *       or not set by owner yet) + *  3) Other non-zero value + *     - a writer owns the lock + */ +#define RWSEM_READER_OWNED	((struct task_struct *)1UL) +  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. 
+ */  static inline void rwsem_set_owner(struct rw_semaphore *sem)  { -	sem->owner = current; +	WRITE_ONCE(sem->owner, current);  }  static inline void rwsem_clear_owner(struct rw_semaphore *sem)  { -	sem->owner = NULL; +	WRITE_ONCE(sem->owner, NULL); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +	/* +	 * We check the owner value first to make sure that we will only +	 * do a write to the rwsem cacheline when it is really necessary +	 * to minimize cacheline contention. +	 */ +	if (sem->owner != RWSEM_READER_OWNED) +		WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ +	return owner && owner != RWSEM_READER_OWNED;  } +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ +	return owner == RWSEM_READER_OWNED; +}  #else  static inline void rwsem_set_owner(struct rw_semaphore *sem)  { @@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)  static inline void rwsem_clear_owner(struct rw_semaphore *sem)  {  } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +}  #endif diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..251d16b4cb41 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)  }  EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) -{ -	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); -} -EXPORT_SYMBOL(phys_to_pfn_t); -  #ifdef CONFIG_ZONE_DEVICE  static DEFINE_MUTEX(pgmap_lock);  static RADIX_TREE(pgmap_radix, GFP_KERNEL); @@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,  	if (is_ram == REGION_INTERSECTS)  		return __va(res->start); -	if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { -		dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", -				__func__); -		return ERR_PTR(-ENXIO); -	} -  	if (!ref)  		return ERR_PTR(-EINVAL); @@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)  	altmap->alloc -= nr_pfns;  } -#ifdef CONFIG_SPARSEMEM_VMEMMAP  struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)  {  	/* @@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)  	return pgmap ? 
pgmap->altmap : NULL;  } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */  #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,8 @@  ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG +KASAN_SANITIZE_snapshot.o	:= n +  obj-y				+= qos.o  obj-$(CONFIG_PM)		+= main.o  obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -126,17 +126,17 @@ out:  	return ret;  } -int pm_prepare_console(void) +void pm_prepare_console(void)  {  	if (!pm_vt_switch()) -		return 0; +		return;  	orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);  	if (orig_fgconsole < 0) -		return 1; +		return;  	orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); -	return 0; +	return;  }  void pm_restore_console(void) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..a881c6a7ba74 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -52,6 +52,7 @@ enum {  #ifdef CONFIG_SUSPEND  	HIBERNATION_SUSPEND,  #endif +	HIBERNATION_TEST_RESUME,  	/* keep last */  	__HIBERNATION_AFTER_LAST  }; @@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode)  	goto Close;  } +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ +	return disable_nonboot_cpus(); +} +  /**   * resume_target_kernel - Restore system state from a hibernation image.   * @platform_mode: Whether or not to use the platform driver. @@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)  	if (error)  		goto Cleanup; -	error = disable_nonboot_cpus(); +	error = hibernate_resume_nonboot_cpu_disable();  	if (error)  		goto Enable_cpus; @@ -642,12 +648,39 @@ static void power_down(void)  		cpu_relax();  } +static int load_image_and_restore(void) +{ +	int error; +	unsigned int flags; + +	pr_debug("PM: Loading hibernation image.\n"); + +	lock_device_hotplug(); +	error = create_basic_memory_bitmaps(); +	if (error) +		goto Unlock; + +	error = swsusp_read(&flags); +	swsusp_close(FMODE_READ); +	if (!error) +		hibernation_restore(flags & SF_PLATFORM_MODE); + +	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); +	swsusp_free(); +	free_basic_memory_bitmaps(); + Unlock: +	unlock_device_hotplug(); + +	return error; +} +  /**   * hibernate - Carry out system hibernation, including saving the image.   */  int hibernate(void)  { -	int error; +	int error, nr_calls = 0; +	bool snapshot_test = false;  	if (!hibernation_available()) {  		pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +695,11 @@ int hibernate(void)  	}  	pm_prepare_console(); -	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); -	if (error) +	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); +	if (error) { +		nr_calls--;  		goto Exit; +	}  	printk(KERN_INFO "PM: Syncing filesystems ... 
");  	sys_sync(); @@ -697,8 +732,12 @@ int hibernate(void)  		pr_debug("PM: writing image.\n");  		error = swsusp_write(flags);  		swsusp_free(); -		if (!error) -			power_down(); +		if (!error) { +			if (hibernation_mode == HIBERNATION_TEST_RESUME) +				snapshot_test = true; +			else +				power_down(); +		}  		in_suspend = 0;  		pm_restore_gfp_mask();  	} else { @@ -709,12 +748,18 @@ int hibernate(void)  	free_basic_memory_bitmaps();   Thaw:  	unlock_device_hotplug(); +	if (snapshot_test) { +		pr_debug("PM: Checking hibernation image\n"); +		error = swsusp_check(); +		if (!error) +			error = load_image_and_restore(); +	}  	thaw_processes();  	/* Don't bother checking whether freezer_test_done is true */  	freezer_test_done = false;   Exit: -	pm_notifier_call_chain(PM_POST_HIBERNATION); +	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);  	pm_restore_console();  	atomic_inc(&snapshot_device_available);   Unlock: @@ -740,8 +785,7 @@ int hibernate(void)   */  static int software_resume(void)  { -	int error; -	unsigned int flags; +	int error, nr_calls = 0;  	/*  	 * If the user said "noresume".. bail out early. @@ -827,35 +871,20 @@ static int software_resume(void)  	}  	pm_prepare_console(); -	error = pm_notifier_call_chain(PM_RESTORE_PREPARE); -	if (error) +	error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); +	if (error) { +		nr_calls--;  		goto Close_Finish; +	}  	pr_debug("PM: Preparing processes for restore.\n");  	error = freeze_processes();  	if (error)  		goto Close_Finish; - -	pr_debug("PM: Loading hibernation image.\n"); - -	lock_device_hotplug(); -	error = create_basic_memory_bitmaps(); -	if (error) -		goto Thaw; - -	error = swsusp_read(&flags); -	swsusp_close(FMODE_READ); -	if (!error) -		hibernation_restore(flags & SF_PLATFORM_MODE); - -	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); -	swsusp_free(); -	free_basic_memory_bitmaps(); - Thaw: -	unlock_device_hotplug(); +	error = load_image_and_restore();  	thaw_processes();   Finish: -	pm_notifier_call_chain(PM_POST_RESTORE); +	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);  	pm_restore_console();  	atomic_inc(&snapshot_device_available);  	/* For success case, the suspend path will release the lock */ @@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = {  #ifdef CONFIG_SUSPEND  	[HIBERNATION_SUSPEND]	= "suspend",  #endif +	[HIBERNATION_TEST_RESUME]	= "test_resume",  };  /* @@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,  #ifdef CONFIG_SUSPEND  		case HIBERNATION_SUSPEND:  #endif +		case HIBERNATION_TEST_RESUME:  			break;  		case HIBERNATION_PLATFORM:  			if (hibernation_ops) @@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,  #ifdef CONFIG_SUSPEND  		case HIBERNATION_SUSPEND:  #endif +		case HIBERNATION_TEST_RESUME:  			hibernation_mode = mode;  			break;  		case HIBERNATION_PLATFORM: @@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str)  static int __init hibernate_setup(char *str)  { -	if (!strncmp(str, "noresume", 8)) +	if (!strncmp(str, "noresume", 8)) {  		noresume = 1; -	else if (!strncmp(str, "nocompress", 10)) +	} else if (!strncmp(str, "nocompress", 10)) {  		nocompress = 1; -	else if (!strncmp(str, "no", 2)) { +	} else if (!strncmp(str, "no", 2)) {  		noresume = 1;  		nohibernate = 1; +	} else if (IS_ENABLED(CONFIG_DEBUG_RODATA) +		   && !strncmp(str, "protect_image", 13)) { +		enable_restore_image_protection();  	}  	return 1;  
} @@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str)  	return 1;  } -static int __init kaslr_nohibernate_setup(char *str) -{ -	return nohibernate_setup(str); -} -  static int __init page_poison_nohibernate_setup(char *str)  {  #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup);  __setup("resumewait", resumewait_setup);  __setup("resumedelay=", resumedelay_setup);  __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup);  __setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)  }  EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)  { -	int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); +	int ret; + +	ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, +						nr_to_call, nr_calls);  	return notifier_to_errno(ret);  } +int pm_notifier_call_chain(unsigned long val) +{ +	return __pm_notifier_call_chain(val, -1, NULL); +}  /* If set, devices may be suspended and resumed asynchronously. */  int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)  }  #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ +extern int hibernate_resume_nonboot_cpu_disable(void); +  /*   * Keep some memory free so that I/O operations can succeed without paging   * [Might this be more than 4 MB?] @@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);  extern int hibernation_restore(int platform_mode);  extern int hibernation_platform_enter(void); +#ifdef CONFIG_DEBUG_RODATA +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_DEBUG_RODATA */ +  #else /* !CONFIG_HIBERNATION */  static inline void hibernate_reserved_size_init(void) {} @@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {}  #ifdef CONFIG_PM_SLEEP  /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, +				    int *nr_calls);  extern int pm_notifier_call_chain(unsigned long val);  #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..8f27d5a8adf6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)  		       elapsed_msecs / 1000, elapsed_msecs % 1000,  		       todo - wq_busy, wq_busy); +		if (wq_busy) +			show_workqueue_state(); +  		if (!wakeup) {  			read_lock(&tasklist_lock);  			for_each_process_thread(g, p) { @@ -146,6 +149,18 @@ int freeze_processes(void)  	if (!error && !oom_killer_disable())  		error = -EBUSY; +	/* +	 * There is a hard to fix race between oom_reaper kernel thread +	 * and oom_killer_disable. oom_reaper calls exit_oom_victim +	 * before the victim reaches exit_mm so try to freeze all the tasks +	 * again and catch such a left over task. +	 */ +	if (!error) { +		pr_info("Double checking all user space processes after OOM killer disable... 
"); +		error = try_to_freeze_tasks(true); +		pr_cont("\n"); +	} +  	if (error)  		thaw_processes();  	return error; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,6 +38,43 @@  #include "power.h" +#ifdef CONFIG_DEBUG_RODATA +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ +	hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ +	hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ +	hibernate_restore_protection_active = false; +} + +static inline void hibernate_restore_protect_page(void *page_address) +{ +	if (hibernate_restore_protection_active) +		set_memory_ro((unsigned long)page_address, 1); +} + +static inline void hibernate_restore_unprotect_page(void *page_address) +{ +	if (hibernate_restore_protection_active) +		set_memory_rw((unsigned long)page_address, 1); +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline void hibernate_restore_protect_page(void *page_address) {} +static inline void hibernate_restore_unprotect_page(void *page_address) {} +#endif /* CONFIG_DEBUG_RODATA */ +  static int swsusp_page_is_free(struct page *);  static void swsusp_set_page_forbidden(struct page *);  static void swsusp_unset_page_forbidden(struct page *); @@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void)  	image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;  } -/* List of PBEs needed for restoring the pages that were allocated before +/* + * List of PBEs needed for restoring the pages that were allocated before   * the suspend and included in the suspend image, but have also been   * allocated by the "resume" kernel, so their contents cannot be written   * directly to their "original" page frames.   */  struct pbe *restore_pblist; -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; +/* struct linked_page is used to build chains of pages */ -/** - *	@safe_needed - on resume, for storing the PBE list and the image, - *	we can only use memory pages that do not conflict with the pages - *	used before suspend.  The unsafe pages have PageNosaveFree set - *	and we count them using unsafe_pages. - * - *	Each allocated image page is marked as PageNosave and PageNosaveFree - *	so that swsusp_free() can release it. +#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *)) + +struct linked_page { +	struct linked_page *next; +	char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents.   */ +static struct linked_page *safe_pages_list; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer;  #define PG_ANY		0  #define PG_SAFE		1 @@ -94,6 +138,19 @@ static void *buffer;  static unsigned int allocated_unsafe_pages; +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. 
+ * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation.  The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */  static void *get_image_page(gfp_t gfp_mask, int safe_needed)  {  	void *res; @@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)  	return res;  } +static void *__get_safe_page(gfp_t gfp_mask) +{ +	if (safe_pages_list) { +		void *ret = safe_pages_list; + +		safe_pages_list = safe_pages_list->next; +		memset(ret, 0, PAGE_SIZE); +		return ret; +	} +	return get_image_page(gfp_mask, PG_SAFE); +} +  unsigned long get_safe_page(gfp_t gfp_mask)  { -	return (unsigned long)get_image_page(gfp_mask, PG_SAFE); +	return (unsigned long)__get_safe_page(gfp_mask);  }  static struct page *alloc_image_page(gfp_t gfp_mask) @@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)  	return page;  } +static void recycle_safe_page(void *page_address) +{ +	struct linked_page *lp = page_address; + +	lp->next = safe_pages_list; +	safe_pages_list = lp; +} +  /** - *	free_image_page - free page represented by @addr, allocated with - *	get_image_page (page flags set by it must be cleared) + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected).   */ -  static inline void free_image_page(void *addr, int clear_nosave_free)  {  	struct page *page; @@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)  	__free_page(page);  } -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *)) - -struct linked_page { -	struct linked_page *next; -	char data[LINKED_PAGE_DATA_SIZE]; -} __packed; - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) +static inline void free_list_of_pages(struct linked_page *list, +				      int clear_page_nosave)  {  	while (list) {  		struct linked_page *lp = list->next; @@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)  	}  } -/** -  *	struct chain_allocator is used for allocating small objects out of -  *	a linked list of pages called 'the chain'. -  * -  *	The chain grows each time when there is no room for a new object in -  *	the current page.  The allocated objects cannot be freed individually. -  *	It is only possible to free them all at once, by freeing the entire -  *	chain. -  * -  *	NOTE: The chain allocator may be inefficient if the allocated objects -  *	are not much smaller than PAGE_SIZE. -  */ - +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page.  The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. 
+ */  struct chain_allocator {  	struct linked_page *chain;	/* the chain */  	unsigned int used_space;	/* total size of objects allocated out -					 * of the current page -					 */ +					   of the current page */  	gfp_t gfp_mask;		/* mask for allocating pages */  	int safe_needed;	/* if set, only "safe" pages are allocated */  }; -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, +		       int safe_needed)  {  	ca->chain = NULL;  	ca->used_space = LINKED_PAGE_DATA_SIZE; @@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)  	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {  		struct linked_page *lp; -		lp = get_image_page(ca->gfp_mask, ca->safe_needed); +		lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : +					get_image_page(ca->gfp_mask, PG_ANY);  		if (!lp)  			return NULL; @@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)  }  /** - *	Data types related to memory bitmaps. + * Data types related to memory bitmaps.   * - *	Memory bitmap is a structure consiting of many linked lists of - *	objects.  The main list's elements are of type struct zone_bitmap - *	and each of them corresonds to one zone.  For each zone bitmap - *	object there is a list of objects of type struct bm_block that - *	represent each blocks of bitmap in which information is stored. + * Memory bitmap is a structure consiting of many linked lists of + * objects.  The main list's elements are of type struct zone_bitmap + * and each of them corresonds to one zone.  For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored.   * - *	struct memory_bitmap contains a pointer to the main list of zone - *	bitmap objects, a struct bm_position used for browsing the bitmap, - *	and a pointer to the list of pages used for allocating all of the - *	zone bitmap objects and bitmap block objects. + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects.   * - *	NOTE: It has to be possible to lay out the bitmap in memory - *	using only allocations of order 0.  Additionally, the bitmap is - *	designed to work with arbitrary number of zones (this is over the - *	top for now, but let's avoid making unnecessary assumptions ;-). + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0.  Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-).   * - *	struct zone_bitmap contains a pointer to a list of bitmap block - *	objects and a pointer to the bitmap block object that has been - *	most recently used for setting bits.  Additionally, it contains the - *	pfns that correspond to the start and end of the represented zone. + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits.  Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone.   
* - *	struct bm_block contains a pointer to the memory page in which - *	information is stored (in the form of a block of bitmap) - *	It also contains the pfns that correspond to the start and end of - *	the represented memory area. + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area.   * - *	The memory bitmap is organized as a radix tree to guarantee fast random - *	access to the bits. There is one radix tree for each zone (as returned - *	from create_mem_extents). + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents).   * - *	One radix tree is represented by one struct mem_zone_bm_rtree. There are - *	two linked lists for the nodes of the tree, one for the inner nodes and - *	one for the leave nodes. The linked leave nodes are used for fast linear - *	access of the memory bitmap. + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap.   * - *	The struct rtree_node represents one node of the radix tree. + * The struct rtree_node represents one node of the radix tree.   */  #define BM_END_OF_MAP	(~0UL) @@ -305,9 +375,8 @@ struct bm_position {  struct memory_bitmap {  	struct list_head zones;  	struct linked_page *p_list;	/* list of pages used to store zone -					 * bitmap objects and bitmap block -					 * objects -					 */ +					   bitmap objects and bitmap block +					   objects */  	struct bm_position cur;	/* most recently used bit position */  }; @@ -321,12 +390,12 @@ struct memory_bitmap {  #endif  #define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1) -/* - *	alloc_rtree_node - Allocate a new node and add it to the radix tree. +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree.   * - *	This function is used to allocate inner nodes as well as the - *	leave nodes of the radix tree. It also adds the node to the - *	corresponding linked list passed in by the *list parameter. + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter.   */  static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,  					   struct chain_allocator *ca, @@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,  	return node;  } -/* - *	add_rtree_block - Add a new leave node to the radix tree +/** + * add_rtree_block - Add a new leave node to the radix tree.   * - *	The leave nodes need to be allocated in order to keep the leaves - *	linked list in order. This is guaranteed by the zone->blocks - *	counter. + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter.   
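The radix tree layout means a PFN offset within a zone decomposes into a fixed set of indices: the low BM_BLOCK_SHIFT bits pick a bit inside a leaf bitmap page, and each inner level consumes a further BM_RTREE_LEVEL_SHIFT bits. The concrete values below (4 KiB pages, 64-bit pointers, three inner levels) are assumptions chosen purely for illustration, not taken from this patch:

#include <stdio.h>

/* Illustrative values for a 4 KiB page on a 64-bit kernel. */
#define PAGE_SHIFT		12
#define BM_BLOCK_SHIFT		(PAGE_SHIFT + 3)	/* PAGE_SIZE * 8 bits per leaf */
#define BM_BLOCK_MASK		((1UL << BM_BLOCK_SHIFT) - 1)
#define BM_RTREE_LEVEL_SHIFT	(PAGE_SHIFT - 3)	/* pointers per node page */
#define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1)

int main(void)
{
	unsigned long zone_start_pfn = 0x1000;	/* made-up zone */
	unsigned long pfn = 0x123456;		/* made-up PFN inside it */
	unsigned long block_nr = (pfn - zone_start_pfn) >> BM_BLOCK_SHIFT;
	int level;

	printf("bit %lu of leaf block %lu\n",
	       (pfn - zone_start_pfn) & BM_BLOCK_MASK, block_nr);

	/* Index consumed at each inner level while walking down the tree. */
	for (level = 2; level >= 0; level--)
		printf("level %d index: %lu\n", level,
		       (block_nr >> (level * BM_RTREE_LEVEL_SHIFT)) &
		       BM_RTREE_LEVEL_MASK);
	return 0;
}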
*/  static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,  			   int safe_needed, struct chain_allocator *ca) @@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,  static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,  			       int clear_nosave_free); -/* - *	create_zone_bm_rtree - create a radix tree for one zone +/** + * create_zone_bm_rtree - Create a radix tree for one zone.   * - *	Allocated the mem_zone_bm_rtree structure and initializes it. - *	This function also allocated and builds the radix tree for the - *	zone. + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone.   */ -static struct mem_zone_bm_rtree * -create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, -		     struct chain_allocator *ca, -		     unsigned long start, unsigned long end) +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, +						      int safe_needed, +						      struct chain_allocator *ca, +						      unsigned long start, +						      unsigned long end)  {  	struct mem_zone_bm_rtree *zone;  	unsigned int i, nr_blocks; @@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,  	return zone;  } -/* - *	free_zone_bm_rtree - Free the memory of the radix tree +/** + * free_zone_bm_rtree - Free the memory of the radix tree.   * - *	Free all node pages of the radix tree. The mem_zone_bm_rtree - *	structure itself is not freed here nor are the rtree_node - *	structs. + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs.   */  static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,  			       int clear_nosave_free) @@ -492,8 +562,8 @@ struct mem_extent {  };  /** - *	free_mem_extents - free a list of memory extents - *	@list - list of extents to empty + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free.   */  static void free_mem_extents(struct list_head *list)  { @@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list)  }  /** - *	create_mem_extents - create a list of memory extents representing - *	                     contiguous ranges of PFNs - *	@list - list to put the extents into - *	@gfp_mask - mask to use for memory allocations + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs.   */  static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)  { @@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)  }  /** -  *	memory_bm_create - allocate memory for a memory bitmap -  */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + * memory_bm_create - Allocate memory for a memory bitmap. + */ +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, +			    int safe_needed)  {  	struct chain_allocator ca;  	struct list_head mem_extents; @@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)  }  /** -  *	memory_bm_free - free memory occupied by the memory bitmap @bm -  */ + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. 
+ */  static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)  {  	struct mem_zone_bm_rtree *zone; @@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)  }  /** - *	memory_bm_find_bit - Find the bit for pfn in the memory - *			     bitmap + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.   * - *	Find the bit in the bitmap @bm that corresponds to given pfn. - *	The cur.zone, cur.block and cur.node_pfn member of @bm are - *	updated. - *	It walks the radix tree to find the page which contains the bit for - *	pfn and returns the bit position in **addr and *bit_nr. + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr.   */  static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,  			      void **addr, unsigned int *bit_nr) @@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,  zone_found:  	/* -	 * We have a zone. Now walk the radix tree to find the leave -	 * node for our pfn. +	 * We have found the zone. Now walk the radix tree to find the leaf node +	 * for our PFN.  	 */ -  	node = bm->cur.node;  	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)  		goto node_found; @@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)  }  /* - *	rtree_next_node - Jumps to the next leave node + * rtree_next_node - Jump to the next leaf node.   * - *	Sets the position to the beginning of the next node in the - *	memory bitmap. This is either the next node in the current - *	zone's radix tree or the first node in the radix tree of the - *	next zone. + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone.   * - *	Returns true if there is a next node, false otherwise. + * Return true if there is a next node, false otherwise.   */  static bool rtree_next_node(struct memory_bitmap *bm)  { @@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)  }  /** - *	memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap.   * - *	Starting from the last returned position this function searches - *	for the next set bit in the memory bitmap and returns its - *	number. If no more bit is set BM_END_OF_MAP is returned. + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it.  If no more bits are + * set, BM_END_OF_MAP is returned.   * - *	It is required to run memory_bm_position_reset() before the - *	first call to this function. + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap.   */  static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)  { @@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)  	return BM_END_OF_MAP;  } -/** - *	This structure represents a range of page frames the contents of which - *	should not be saved during the suspend. 
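As the memory_bm_next_pfn() comment below spells out, iteration over a bitmap follows a simple protocol: reset the position, then keep calling memory_bm_next_pfn() until it returns BM_END_OF_MAP. duplicate_memory_bitmap() later in this patch is exactly that loop; the following caller-side sketch mirrors it (an illustration only, not a function added by the patch):

/*
 * Sketch: count the set bits (saveable PFNs) in a memory bitmap using the
 * iteration protocol described above.
 */
static unsigned long count_bits_in_bitmap(struct memory_bitmap *bm)
{
	unsigned long pfn, n = 0;

	memory_bm_position_reset(bm);		/* required before the first call */
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		n++;

	return n;
}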
+/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation.   */ -  struct nosave_region {  	struct list_head list;  	unsigned long start_pfn; @@ -832,15 +902,42 @@ struct nosave_region {  static LIST_HEAD(nosave_regions); +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ +	struct rtree_node *node; + +	list_for_each_entry(node, &zone->nodes, list) +		recycle_safe_page(node->data); + +	list_for_each_entry(node, &zone->leaves, list) +		recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ +	struct mem_zone_bm_rtree *zone; +	struct linked_page *p_list; + +	list_for_each_entry(zone, &bm->zones, list) +		recycle_zone_bm_rtree(zone); + +	p_list = bm->p_list; +	while (p_list) { +		struct linked_page *lp = p_list; + +		p_list = lp->next; +		recycle_safe_page(lp); +	} +} +  /** - *	register_nosave_region - register a range of page frames the contents - *	of which should not be saved during the suspend (to be used in the early - *	initialization code) + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code).   */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, -			 int use_kmalloc) +void __init __register_nosave_region(unsigned long start_pfn, +				     unsigned long end_pfn, int use_kmalloc)  {  	struct nosave_region *region; @@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,  		}  	}  	if (use_kmalloc) { -		/* during init, this shouldn't fail */ +		/* During init, this shouldn't fail */  		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);  		BUG_ON(!region); -	} else +	} else {  		/* This allocation cannot fail */  		region = memblock_virt_alloc(sizeof(struct nosave_region), 0); +	}  	region->start_pfn = start_pfn;  	region->end_pfn = end_pfn;  	list_add_tail(®ion->list, &nosave_regions); @@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page)  }  /** - *	mark_nosave_pages - set bits corresponding to the page frames the - *	contents of which should not be saved in a given bitmap. + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved.   */ -  static void mark_nosave_pages(struct memory_bitmap *bm)  {  	struct nosave_region *region; @@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)  }  /** - *	create_basic_memory_bitmaps - create bitmaps needed for marking page - *	frames that should not be saved and free page frames.  The pointers - *	forbidden_pages_map and free_pages_map are only modified if everything - *	goes well, because we don't want the bits to be used before both bitmaps - *	are set up. + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames.  The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up.   
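register_nosave_region() (the inline wrapper around __register_nosave_region()) is intended for early platform/arch init code, called once per firmware-reserved range that must stay out of the image; mark_nosave_pages() later walks the resulting list and sets the corresponding bits. A hedged usage sketch, with a made-up PFN range:

/*
 * Illustration only: exclude a hypothetical firmware mailbox page from the
 * hibernation image.  Must run in early init, before the basic bitmaps are
 * created.
 */
static void __init example_reserve_nosave(void)
{
	unsigned long start_pfn = 0x800;	/* hypothetical range */
	unsigned long end_pfn = 0x801;		/* end is exclusive */

	register_nosave_region(start_pfn, end_pfn);
}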
*/ -  int create_basic_memory_bitmaps(void)  {  	struct memory_bitmap *bm1, *bm2; @@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void)  }  /** - *	free_basic_memory_bitmaps - free memory bitmaps allocated by - *	create_basic_memory_bitmaps().  The auxiliary pointers are necessary - *	so that the bitmaps themselves are not referred to while they are being - *	freed. + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps().  The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed.   */ -  void free_basic_memory_bitmaps(void)  {  	struct memory_bitmap *bm1, *bm2; @@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void)  }  /** - *	snapshot_additional_pages - estimate the number of additional pages - *	be needed for setting up the suspend image data structures for given - *	zone (usually the returned value is greater than the exact number) + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number).   */ -  unsigned int snapshot_additional_pages(struct zone *zone)  {  	unsigned int rtree, nodes; @@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)  #ifdef CONFIG_HIGHMEM  /** - *	count_free_highmem_pages - compute the total number of free highmem - *	pages, system-wide. + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide.   */ -  static unsigned int count_free_highmem_pages(void)  {  	struct zone *zone; @@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void)  }  /** - *	saveable_highmem_page - Determine whether a highmem page should be - *	included in the suspend image. + * saveable_highmem_page - Check if a highmem page is saveable.   * - *	We should save the page if it isn't Nosave or NosaveFree, or Reserved, - *	and it isn't a part of a free chunk of pages. + * Determine whether a highmem page should be included in a hibernation image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages.   */  static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)  { @@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)  }  /** - *	count_highmem_pages - compute the total number of saveable highmem - *	pages. + * count_highmem_pages - Compute the total number of saveable highmem pages.   */ -  static unsigned int count_highmem_pages(void)  {  	struct zone *zone; @@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)  #endif /* CONFIG_HIGHMEM */  /** - *	saveable_page - Determine whether a non-highmem page should be included - *	in the suspend image. + * saveable_page - Check if the given page is saveable.   * - *	We should save the page if it isn't Nosave, and is not in the range - *	of pages statically defined as 'unsaveable', and it isn't a part of - *	a free chunk of pages. + * Determine whether a non-highmem page should be included in a hibernation + * image. 
+ * + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages.   */  static struct page *saveable_page(struct zone *zone, unsigned long pfn)  { @@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)  }  /** - *	count_data_pages - compute the total number of saveable non-highmem - *	pages. + * count_data_pages - Compute the total number of saveable non-highmem pages.   */ -  static unsigned int count_data_pages(void)  {  	struct zone *zone; @@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void)  	return n;  } -/* This is needed, because copy_page and memcpy are not usable for copying +/* + * This is needed, because copy_page and memcpy are not usable for copying   * task structs.   */  static inline void do_copy_page(long *dst, long *src) @@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src)  		*dst++ = *src++;  } -  /** - *	safe_copy_page - check if the page we are going to copy is marked as - *		present in the kernel page tables (this always is the case if - *		CONFIG_DEBUG_PAGEALLOC is not set and in that case - *		kernel_page_present() always returns 'true'). + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set + * and in that case kernel_page_present() always returns 'true').   */  static void safe_copy_page(void *dst, struct page *s_page)  { @@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page)  	}  } -  #ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)  {  	return is_highmem(zone) ?  		saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); @@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)  		kunmap_atomic(src);  	} else {  		if (PageHighMem(d_page)) { -			/* Page pointed to by src may contain some kernel +			/* +			 * The page pointed to by src may contain some kernel  			 * data modified by kmap_atomic()  			 */  			safe_copy_page(buffer, s_page); @@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)  }  #endif /* CONFIG_HIGHMEM */ -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +static void copy_data_pages(struct memory_bitmap *copy_bm, +			    struct memory_bitmap *orig_bm)  {  	struct zone *zone;  	unsigned long pfn; @@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm;  static struct memory_bitmap copy_bm;  /** - *	swsusp_free - free pages allocated for the suspend. + * swsusp_free - Free pages allocated for hibernation image.   * - *	Suspend pages are alocated before the atomic copy is made, so we - *	need to release them after the resume. + * Image pages are alocated before snapshot creation, so they need to be + * released after resume.   
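do_copy_page() below copies a page one long at a time precisely because copy_page()/memcpy() are not usable for task structs, and safe_copy_page() falls back to copying through the buffer when the source page may not be mapped in the kernel page tables. A standalone illustration of the long-wise copy (userspace C, 4096-byte pages assumed):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096	/* illustrative */

/* Copy a page as an array of longs, as do_copy_page() does. */
static void do_copy_page_demo(long *dst, long *src)
{
	int n;

	for (n = PAGE_SIZE / sizeof(long); n; n--)
		*dst++ = *src++;
}

int main(void)
{
	static long src[PAGE_SIZE / sizeof(long)], dst[PAGE_SIZE / sizeof(long)];

	src[0] = 42;
	do_copy_page_demo(dst, src);
	printf("%s\n", memcmp(src, dst, PAGE_SIZE) ? "mismatch" : "copied");
	return 0;
}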
*/ -  void swsusp_free(void)  {  	unsigned long fb_pfn, fr_pfn; @@ -1351,6 +1451,7 @@ loop:  		memory_bm_clear_current(forbidden_pages_map);  		memory_bm_clear_current(free_pages_map); +		hibernate_restore_unprotect_page(page_address(page));  		__free_page(page);  		goto loop;  	} @@ -1362,6 +1463,7 @@ out:  	buffer = NULL;  	alloc_normal = 0;  	alloc_highmem = 0; +	hibernate_restore_protection_end();  }  /* Helper functions used for the shrinking of memory. */ @@ -1369,7 +1471,7 @@ out:  #define GFP_IMAGE	(GFP_KERNEL | __GFP_NOWARN)  /** - * preallocate_image_pages - Allocate a number of pages for hibernation image + * preallocate_image_pages - Allocate a number of pages for hibernation image.   * @nr_pages: Number of page frames to allocate.   * @mask: GFP flags to use for the allocation.   * @@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)  }  /** - *  __fraction - Compute (an approximation of) x * (multiplier / base) + *  __fraction - Compute (an approximation of) x * (multiplier / base).   */  static unsigned long __fraction(u64 x, u64 multiplier, u64 base)  { @@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)  }  static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, -						unsigned long highmem, -						unsigned long total) +						  unsigned long highmem, +						  unsigned long total)  {  	unsigned long alloc = __fraction(nr_pages, highmem, total); @@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)  }  static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, -						unsigned long highmem, -						unsigned long total) +							 unsigned long highmem, +							 unsigned long total)  {  	return 0;  }  #endif /* CONFIG_HIGHMEM */  /** - * free_unnecessary_pages - Release preallocated pages not needed for the image + * free_unnecessary_pages - Release preallocated pages not needed for the image.   */  static unsigned long free_unnecessary_pages(void)  { @@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void)  }  /** - * minimum_image_size - Estimate the minimum acceptable size of an image + * minimum_image_size - Estimate the minimum acceptable size of an image.   * @saveable: Number of saveable pages in the system.   *   * We want to avoid attempting to free too much memory too hard, so estimate the @@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable)  	unsigned long size;  	size = global_page_state(NR_SLAB_RECLAIMABLE) -		+ global_page_state(NR_ACTIVE_ANON) -		+ global_page_state(NR_INACTIVE_ANON) -		+ global_page_state(NR_ACTIVE_FILE) -		+ global_page_state(NR_INACTIVE_FILE) -		- global_page_state(NR_FILE_MAPPED); +		+ global_node_page_state(NR_ACTIVE_ANON) +		+ global_node_page_state(NR_INACTIVE_ANON) +		+ global_node_page_state(NR_ACTIVE_FILE) +		+ global_node_page_state(NR_INACTIVE_FILE) +		- global_node_page_state(NR_FILE_MAPPED);  	return saveable <= size ? 0 : saveable - size;  }  /** - * hibernate_preallocate_memory - Preallocate memory for hibernation image + * hibernate_preallocate_memory - Preallocate memory for hibernation image.   *   * To create a hibernation image it is necessary to make a copy of every page   * frame in use.  
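preallocate_highmem_fraction() below splits the preallocation between highmem and lowmem in proportion to their sizes: it requests roughly nr_pages * highmem / total pages of highmem via __fraction(). A standalone illustration of that proportion with made-up numbers (the in-kernel helper does the same arithmetic with 64-bit math and do_div(); this is only the computation):

#include <stdio.h>
#include <stdint.h>

/* Approximate x * (multiplier / base) in integer arithmetic. */
static unsigned long fraction(uint64_t x, uint64_t multiplier, uint64_t base)
{
	return (unsigned long)(x * multiplier / base);
}

int main(void)
{
	unsigned long nr_pages = 100000;	/* pages to preallocate (made up) */
	unsigned long highmem = 300000;		/* saveable highmem pages */
	unsigned long total = 500000;		/* all saveable pages */

	/* 3/5 of the preallocation should come from highmem. */
	printf("highmem share: %lu pages\n", fraction(nr_pages, highmem, total));
	return 0;
}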
We also need a number of page frames to be free during @@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void)  #ifdef CONFIG_HIGHMEM  /** -  *	count_pages_for_highmem - compute the number of non-highmem pages -  *	that will be necessary for creating copies of highmem pages. -  */ - + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. + */  static unsigned int count_pages_for_highmem(unsigned int nr_highmem)  {  	unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; @@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)  	return nr_highmem;  }  #else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }  #endif /* CONFIG_HIGHMEM */  /** - *	enough_free_mem - Make sure we have enough free memory for the - *	snapshot image. + * enough_free_mem - Check if there is enough free memory for the image.   */ -  static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)  {  	struct zone *zone; @@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)  #ifdef CONFIG_HIGHMEM  /** - *	get_highmem_buffer - if there are some highmem pages in the suspend - *	image, we may need the buffer to copy them and/or load their data. + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data.   */ -  static inline int get_highmem_buffer(int safe_needed)  {  	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); @@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed)  }  /** - *	alloc_highmem_image_pages - allocate some highmem pages for the image. - *	Try to allocate as many pages as needed, but if the number of free - *	highmem pages is lesser than that, allocate them all. + * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all.   */ - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, +					       unsigned int nr_highmem)  {  	unsigned int to_alloc = count_free_highmem_pages(); @@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)  #else  static inline int get_highmem_buffer(int safe_needed) { return 0; } -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, +					       unsigned int n) { return 0; }  #endif /* CONFIG_HIGHMEM */  /** - *	swsusp_alloc - allocate memory for the suspend image + * swsusp_alloc - Allocate memory for hibernation image.   * - *	We first try to allocate as many highmem pages as there are - *	saveable highmem pages in the system.  If that fails, we allocate - *	non-highmem pages for the copies of the remaining highmem ones. + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system.  If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones.   
* - *	In this approach it is likely that the copies of highmem pages will - *	also be located in the high memory, because of the way in which - *	copy_data_pages() works. + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works.   */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, -		unsigned int nr_pages, unsigned int nr_highmem) +static int swsusp_alloc(struct memory_bitmap *orig_bm, +			struct memory_bitmap *copy_bm, +			unsigned int nr_pages, unsigned int nr_highmem)  {  	if (nr_highmem > 0) {  		if (get_highmem_buffer(PG_ANY)) @@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void)  		return -ENOMEM;  	} -	/* During allocating of suspend pagedir, new cold pages may appear. +	/* +	 * During allocating of suspend pagedir, new cold pages may appear.  	 * Kill them.  	 */  	drain_local_pages(NULL); @@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info)  }  /** - *	pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - *	are stored in the array @buf[] (1 page at a time) + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time).   */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)  {  	int j; @@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)  }  /** - *	snapshot_read_next - used for reading the system memory snapshot. + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading.   * - *	On the first call to it @handle should point to a zeroed - *	snapshot_handle structure.  The structure gets updated and a pointer - *	to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure.  The structure gets populated then and a pointer to it should be + * passed to this function every next time.   * - *	On success the function returns a positive number.  Then, the caller - *	is allowed to read up to the returned number of bytes from the memory - *	location computed by the data_of() macro. + * On success, the function returns a positive number.  Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro.   * - *	The function returns 0 to indicate the end of data stream condition, - *	and a negative number is returned on error.  In such cases the - *	structure pointed to by @handle is not updated and should not be used - *	any more. + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors.  If that happens, the structure + * pointed to by @handle is not updated and should not be used any more.   
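The snapshot_read_next() kerneldoc above describes a pull-style protocol: start from a zeroed snapshot_handle and, on each positive return, consume up to that many bytes from data_of(handle). A hedged sketch of how a caller (for instance, code streaming the image to swap or to user space) might drive it; the sink callback is hypothetical and error handling is minimal:

/* Illustration of the read protocol; not a function added by this patch. */
static int stream_image_example(int (*sink)(void *buf, unsigned int count))
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(handle));	/* must start zeroed */
	while ((ret = snapshot_read_next(&handle)) > 0) {
		int error = sink(data_of(handle), ret);	/* up to 'ret' bytes */

		if (error)
			return error;
	}
	return ret;	/* 0 on end of the data stream, negative on error */
}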
*/ -  int snapshot_read_next(struct snapshot_handle *handle)  {  	if (handle->cur > nr_meta_pages + nr_copy_pages) @@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle)  		page = pfn_to_page(memory_bm_next_pfn(©_bm));  		if (PageHighMem(page)) { -			/* Highmem pages are copied to the buffer, +			/* +			 * Highmem pages are copied to the buffer,  			 * because we can't return with a kmapped  			 * highmem page (we may not be called again).  			 */ @@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle)  	return PAGE_SIZE;  } -/** - *	mark_unsafe_pages - mark the pages that cannot be used for storing - *	the image during resume, because they conflict with the pages that - *	had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) +static void duplicate_memory_bitmap(struct memory_bitmap *dst, +				    struct memory_bitmap *src)  { -	struct zone *zone; -	unsigned long pfn, max_zone_pfn; +	unsigned long pfn; -	/* Clear page flags */ -	for_each_populated_zone(zone) { -		max_zone_pfn = zone_end_pfn(zone); -		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) -			if (pfn_valid(pfn)) -				swsusp_unset_page_free(pfn_to_page(pfn)); +	memory_bm_position_reset(src); +	pfn = memory_bm_next_pfn(src); +	while (pfn != BM_END_OF_MAP) { +		memory_bm_set_bit(dst, pfn); +		pfn = memory_bm_next_pfn(src);  	} - -	/* Mark pages that correspond to the "original" pfns as "unsafe" */ -	memory_bm_position_reset(bm); -	do { -		pfn = memory_bm_next_pfn(bm); -		if (likely(pfn != BM_END_OF_MAP)) { -			if (likely(pfn_valid(pfn))) -				swsusp_set_page_free(pfn_to_page(pfn)); -			else -				return -EFAULT; -		} -	} while (pfn != BM_END_OF_MAP); - -	allocated_unsafe_pages = 0; - -	return 0;  } -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) +/** + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. + */ +static void mark_unsafe_pages(struct memory_bitmap *bm)  {  	unsigned long pfn; -	memory_bm_position_reset(src); -	pfn = memory_bm_next_pfn(src); +	/* Clear the "free"/"unsafe" bit for all PFNs */ +	memory_bm_position_reset(free_pages_map); +	pfn = memory_bm_next_pfn(free_pages_map);  	while (pfn != BM_END_OF_MAP) { -		memory_bm_set_bit(dst, pfn); -		pfn = memory_bm_next_pfn(src); +		memory_bm_clear_current(free_pages_map); +		pfn = memory_bm_next_pfn(free_pages_map);  	} + +	/* Mark pages that correspond to the "original" PFNs as "unsafe" */ +	duplicate_memory_bitmap(free_pages_map, bm); + +	allocated_unsafe_pages = 0;  }  static int check_header(struct swsusp_info *info) @@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info)  }  /** - *	load header - check the image header and copy data from it + * load header - Check the image header and copy the data from it.   */ - -static int -load_header(struct swsusp_info *info) +static int load_header(struct swsusp_info *info)  {  	int error; @@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info)  }  /** - *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - *	the corresponding bit in the memory bitmap @bm + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. 
+ * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm.   */  static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)  { @@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)  		/* Extract and buffer page key for data page (s390 only). */  		page_key_memorize(buf + j); -		if (memory_bm_pfn_present(bm, buf[j])) +		if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))  			memory_bm_set_bit(bm, buf[j]);  		else  			return -EFAULT; @@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)  	return 0;  } -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; -  #ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that +/* + * struct highmem_pbe is used for creating the list of highmem pages that   * should be restored atomically during the resume from disk, because the page   * frames they have occupied before the suspend are in use.   */ @@ -2120,7 +2209,8 @@ struct highmem_pbe {  	struct highmem_pbe *next;  }; -/* List of highmem PBEs needed for restoring the highmem pages that were +/* + * List of highmem PBEs needed for restoring the highmem pages that were   * allocated before the suspend and included in the suspend image, but have   * also been allocated by the "resume" kernel, so their contents cannot be   * written directly to their "original" page frames. @@ -2128,11 +2218,11 @@ struct highmem_pbe {  static struct highmem_pbe *highmem_pblist;  /** - *	count_highmem_image_pages - compute the number of highmem pages in the - *	suspend image.  The bits in the memory bitmap @bm that correspond to the - *	image pages are assumed to be set. + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set.   */ -  static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)  {  	unsigned long pfn; @@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)  	return cnt;  } -/** - *	prepare_highmem_image - try to allocate as many highmem pages as - *	there are highmem image pages (@nr_highmem_p points to the variable - *	containing the number of highmem image pages).  The pages that are - *	"safe" (ie. will not be overwritten when the suspend image is - *	restored) have the corresponding bits set in @bm (it must be - *	unitialized). - * - *	NOTE: This function should not be called if there are no highmem - *	image pages. - */ -  static unsigned int safe_highmem_pages;  static struct memory_bitmap *safe_highmem_bm; -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages).  The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be unitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. 
+ */ +static int prepare_highmem_image(struct memory_bitmap *bm, +				 unsigned int *nr_highmem_p)  {  	unsigned int to_alloc; @@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)  	return 0;  } +static struct page *last_highmem_page; +  /** - *	get_highmem_page_buffer - for given highmem image page find the buffer - *	that suspend_write_next() should set for its caller to write to. + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.   * - *	If the page is to be saved to its "original" page frame or a copy of - *	the page is to be made in the highmem, @buffer is returned.  Otherwise, - *	the copy of the page is to be made in normal memory, so the address of - *	the copy is returned. + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to.   * - *	If @buffer is returned, the caller of suspend_write_next() will write - *	the page's contents to @buffer, so they will have to be copied to the - *	right location on the next call to suspend_write_next() and it is done - *	with the help of copy_last_highmem_page().  For this purpose, if - *	@buffer is returned, @last_highmem page is set to the page to which - *	the data will have to be copied from @buffer. + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned.  Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page().  For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer.   */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static void *get_highmem_page_buffer(struct page *page, +				     struct chain_allocator *ca)  {  	struct highmem_pbe *pbe;  	void *kaddr;  	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { -		/* We have allocated the "original" page frame and we can +		/* +		 * We have allocated the "original" page frame and we can  		 * use it directly to store the loaded page.  		 */  		last_highmem_page = page;  		return buffer;  	} -	/* The "original" page frame has not been allocated and we have to +	/* +	 * The "original" page frame has not been allocated and we have to  	 * use a "safe" page frame to store the loaded page.  	 */  	pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); @@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)  }  /** - *	copy_last_highmem_page - copy the contents of a highmem image from - *	@buffer, where the caller of snapshot_write_next() has place them, - *	to the right location represented by @last_highmem_page . + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page .   
*/ -  static void copy_last_highmem_page(void)  {  	if (last_highmem_page) { @@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void)  		free_image_page(buffer, PG_UNSAFE_CLEAR);  }  #else -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ -	return 0; -} +static inline int prepare_highmem_image(struct memory_bitmap *bm, +					unsigned int *nr_highmem_p) { return 0; } -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static inline void *get_highmem_page_buffer(struct page *page, +					    struct chain_allocator *ca)  {  	return ERR_PTR(-EINVAL);  } @@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; }  static inline void free_highmem_data(void) {}  #endif /* CONFIG_HIGHMEM */ +#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) +  /** - *	prepare_image - use the memory bitmap @bm to mark the pages that will - *	be overwritten in the process of restoring the system memory state - *	from the suspend image ("unsafe" pages) and allocate memory for the - *	image. + * prepare_image - Make room for loading hibernation image. + * @new_bm: Unitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image.   * - *	The idea is to allocate a new memory bitmap first and then allocate - *	as many pages as needed for the image data, but not to assign these - *	pages to specific tasks initially.  Instead, we just mark them as - *	allocated and create a lists of "safe" pages that will be used - *	later.  On systems with high memory a list of "safe" highmem pages is - *	also created. + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet.  Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later.  On systems with high + * memory a list of "safe" highmem pages is created too.   */ - -#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  {  	unsigned int nr_pages, nr_highmem; -	struct linked_page *sp_list, *lp; +	struct linked_page *lp;  	int error;  	/* If there is no highmem, the buffer will not be necessary */ @@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  	buffer = NULL;  	nr_highmem = count_highmem_image_pages(bm); -	error = mark_unsafe_pages(bm); -	if (error) -		goto Free; +	mark_unsafe_pages(bm);  	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);  	if (error) @@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  		if (error)  			goto Free;  	} -	/* Reserve some safe pages for potential later use. +	/* +	 * Reserve some safe pages for potential later use.  	 *  	 * NOTE: This way we make sure there will be enough safe pages for the  	 * chain_alloc() in get_buffer().  
It is a bit wasteful, but  	 * nr_copy_pages cannot be greater than 50% of the memory anyway. +	 * +	 * nr_copy_pages cannot be less than allocated_unsafe_pages too.  	 */ -	sp_list = NULL; -	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */  	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;  	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);  	while (nr_pages > 0) { @@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  			error = -ENOMEM;  			goto Free;  		} -		lp->next = sp_list; -		sp_list = lp; +		lp->next = safe_pages_list; +		safe_pages_list = lp;  		nr_pages--;  	}  	/* Preallocate memory for the image */ -	safe_pages_list = NULL;  	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;  	while (nr_pages > 0) {  		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); @@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  		swsusp_set_page_free(virt_to_page(lp));  		nr_pages--;  	} -	/* Free the reserved safe pages so that chain_alloc() can use them */ -	while (sp_list) { -		lp = sp_list->next; -		free_image_page(sp_list, PG_UNSAFE_CLEAR); -		sp_list = lp; -	}  	return 0;   Free: @@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)  }  /** - *	get_buffer - compute the address that snapshot_write_next() should - *	set for its caller to write to. + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to.   */ -  static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)  {  	struct pbe *pbe; @@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)  		return get_highmem_page_buffer(page, ca);  	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) -		/* We have allocated the "original" page frame and we can +		/* +		 * We have allocated the "original" page frame and we can  		 * use it directly to store the loaded page.  		 */  		return page_address(page); -	/* The "original" page frame has not been allocated and we have to +	/* +	 * The "original" page frame has not been allocated and we have to  	 * use a "safe" page frame to store the loaded page.  	 */  	pbe = chain_alloc(ca, sizeof(struct pbe)); @@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)  }  /** - *	snapshot_write_next - used for writing the system memory snapshot. + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing.   * - *	On the first call to it @handle should point to a zeroed - *	snapshot_handle structure.  The structure gets updated and a pointer - *	to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure.  The structure gets populated then and a pointer to it should be + * passed to this function every next time.   * - *	On success the function returns a positive number.  Then, the caller - *	is allowed to write up to the returned number of bytes to the memory - *	location computed by the data_of() macro. + * On success, the function returns a positive number.  Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro.   
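The reservation loop in prepare_image() sizes the safe-page stash from the arithmetic above: one struct pbe will be needed per data page restored into a "safe" location, PBES_PER_LINKED_PAGE of them fit in one linked page, and the count is rounded up. A standalone illustration with assumed sizes (4 KiB pages and a 24-byte struct pbe, i.e. three pointers on 64-bit; the real values depend on the architecture):

#include <stdio.h>

#define PAGE_SIZE		4096			/* assumed */
#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
#define SIZEOF_PBE		24			/* address, orig_address, next */
#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / SIZEOF_PBE)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long nr_copy_pages = 200000;		/* made-up image size */
	unsigned long nr_highmem = 0;
	unsigned long allocated_unsafe_pages = 1500;
	unsigned long nr_pages;

	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
	printf("safe pages reserved for PBE lists: %lu\n",
	       (unsigned long)DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE));
	return 0;
}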
* - *	The function returns 0 to indicate the "end of file" condition, - *	and a negative number is returned on error.  In such cases the - *	structure pointed to by @handle is not updated and should not be used - *	any more. + * The function returns 0 to indicate the "end of file" condition.  Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more.   */ -  int snapshot_write_next(struct snapshot_handle *handle)  {  	static struct chain_allocator ca; @@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle)  		if (error)  			return error; +		safe_pages_list = NULL; +  		error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY);  		if (error)  			return error; @@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)  		if (error)  			return error; +		hibernate_restore_protection_begin();  	} else if (handle->cur <= nr_meta_pages + 1) {  		error = unpack_orig_pfns(buffer, ©_bm);  		if (error) @@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)  		copy_last_highmem_page();  		/* Restore page key for data page (s390 only). */  		page_key_write(handle->buffer); +		hibernate_restore_protect_page(handle->buffer);  		handle->buffer = get_buffer(&orig_bm, &ca);  		if (IS_ERR(handle->buffer))  			return PTR_ERR(handle->buffer); @@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle)  }  /** - *	snapshot_write_finalize - must be called after the last call to - *	snapshot_write_next() in case the last page in the image happens - *	to be a highmem page and its contents should be stored in the - *	highmem.  Additionally, it releases the memory that will not be - *	used any more. + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem.  Additionally, it recycles bitmap memory that's not + * necessary any more.   */ -  void snapshot_write_finalize(struct snapshot_handle *handle)  {  	copy_last_highmem_page();  	/* Restore page key for data page (s390 only). */  	page_key_write(handle->buffer);  	page_key_free(); -	/* Free only if we have loaded the image entirely */ +	hibernate_restore_protect_page(handle->buffer); +	/* Do that only if we have loaded the image entirely */  	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { -		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); +		memory_bm_recycle(&orig_bm);  		free_highmem_data();  	}  } @@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)  #ifdef CONFIG_HIGHMEM  /* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) +static inline void swap_two_pages_data(struct page *p1, struct page *p2, +				       void *buf)  {  	void *kaddr1, *kaddr2; @@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)  }  /** - *	restore_highmem - for each highmem page that was allocated before - *	the suspend and included in the suspend image, and also has been - *	allocated by the "resume" kernel swap its current (ie. "before - *	resume") contents with the previous (ie. "before suspend") one. + * restore_highmem - Put highmem image pages into their original locations. 
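snapshot_write_next() mirrors the read side: the caller keeps asking for a destination buffer, fills in up to the returned number of bytes at data_of(handle), and stops on 0 ("end of file") or a negative error, after which snapshot_write_finalize() completes the load. A hedged caller-side sketch; the source callback is hypothetical:

/* Illustration of the write protocol; not a function added by this patch. */
static int load_image_example(int (*source)(void *buf, unsigned int count))
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(handle));	/* must start zeroed */
	while ((ret = snapshot_write_next(&handle)) > 0) {
		int error = source(data_of(handle), ret);	/* fill the buffer */

		if (error)
			return error;
	}
	if (!ret)	/* image fully loaded */
		snapshot_write_finalize(&handle);

	return ret;
}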
+ * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones.   * - *	If the resume eventually fails, we can call this function once - *	again and restore the "before resume" highmem state. + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel.   */ -  int restore_highmem(void)  {  	struct highmem_pbe *pbe = highmem_pblist; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -266,16 +266,18 @@ static int suspend_test(int level)   */  static int suspend_prepare(suspend_state_t state)  { -	int error; +	int error, nr_calls = 0;  	if (!sleep_state_supported(state))  		return -EPERM;  	pm_prepare_console(); -	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); -	if (error) +	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); +	if (error) { +		nr_calls--;  		goto Finish; +	}  	trace_suspend_resume(TPS("freeze_processes"), 0, true);  	error = suspend_freeze_processes(); @@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)  	suspend_stats.failed_freeze++;  	dpm_save_failed_step(SUSPEND_FREEZE);   Finish: -	pm_notifier_call_chain(PM_POST_SUSPEND); +	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);  	pm_restore_console();  	return error;  } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..a3b1e617bcdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)  	bio_put(bio);  } -static int hib_submit_io(int rw, pgoff_t page_off, void *addr, +static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,  		struct hib_bio_batch *hb)  {  	struct page *page = virt_to_page(addr); @@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,  	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);  	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);  	bio->bi_bdev = hib_resume_bdev; +	bio_set_op_attrs(bio, op, op_flags);  	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {  		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", @@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,  		bio->bi_end_io = hib_end_io;  		bio->bi_private = hb;  		atomic_inc(&hb->count); -		submit_bio(rw, bio); +		submit_bio(bio);  	} else { -		error = submit_bio_wait(rw, bio); +		error = submit_bio_wait(bio);  		bio_put(bio);  	} @@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)  {  	int error; -	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); +	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, +		      swsusp_header, NULL);  	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||  	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {  		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)  		swsusp_header->flags = flags;  		if (flags & SF_CRC32_MODE)  			swsusp_header->crc32 = handle->crc32; -		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, -					swsusp_header, NULL); +		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, +				      swsusp_resume_block, swsusp_header, NULL);  	} else {  		
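The suspend_prepare() hunk above switches to __pm_notifier_call_chain(), which reports in nr_calls how many notifier callbacks actually ran (passing -1 means "call them all"), so that on failure only those callbacks receive the matching POST event. A condensed sketch of the pattern as used here; __pm_notifier_call_chain() itself is added elsewhere in this series and do_the_actual_work() is a hypothetical stand-in for the freeze step:

/* Illustration of the prepare/rollback pairing used by suspend_prepare(). */
static int prepare_with_rollback_example(void)
{
	int error, nr_calls = 0;

	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
	if (error) {
		nr_calls--;	/* the callback that failed gets no POST event */
		goto Finish;
	}

	error = do_the_actual_work();	/* hypothetical stand-in */
	if (!error)
		return 0;
 Finish:
	/* Roll back only the callbacks that saw PM_SUSPEND_PREPARE. */
	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
	return error;
}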
printk(KERN_ERR "PM: Swap header not found!\n");  		error = -ENODEV; @@ -348,6 +350,12 @@ static int swsusp_swap_check(void)  	if (res < 0)  		blkdev_put(hib_resume_bdev, FMODE_WRITE); +	/* +	 * Update the resume device to the one actually used, +	 * so the test_resume mode can use it in case it is +	 * invoked from hibernate() to test the snapshot. +	 */ +	swsusp_resume_device = hib_resume_bdev->bd_dev;  	return res;  } @@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)  	} else {  		src = buf;  	} -	return hib_submit_io(WRITE_SYNC, offset, src, hb); +	return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);  }  static void release_swap_writer(struct swap_map_handle *handle) @@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,  			return -ENOMEM;  		} -		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); +		error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, +				      tmp->map, NULL);  		if (error) {  			release_swap_reader(handle);  			return error; @@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,  	offset = handle->cur->entries[handle->k];  	if (!offset)  		return -EFAULT; -	error = hib_submit_io(READ_SYNC, offset, buf, hb); +	error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);  	if (error)  		return error;  	if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1525,7 +1534,8 @@ int swsusp_check(void)  	if (!IS_ERR(hib_resume_bdev)) {  		set_blocksize(hib_resume_bdev, PAGE_SIZE);  		clear_page(swsusp_header); -		error = hib_submit_io(READ_SYNC, swsusp_resume_block, +		error = hib_submit_io(REQ_OP_READ, READ_SYNC, +					swsusp_resume_block,  					swsusp_header, NULL);  		if (error)  			goto put; @@ -1533,7 +1543,8 @@ int swsusp_check(void)  		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {  			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);  			/* Reset swap signature now */ -			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, +			error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, +						swsusp_resume_block,  						swsusp_header, NULL);  		} else {  			error = -EINVAL; @@ -1577,10 +1588,12 @@ int swsusp_unmark(void)  {  	int error; -	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); +	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, +		      swsusp_header, NULL);  	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {  		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); -		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, +		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, +					swsusp_resume_block,  					swsusp_header, NULL);  	} else {  		printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);  static int snapshot_open(struct inode *inode, struct file *filp)  {  	struct snapshot_data *data; -	int error; +	int error, nr_calls = 0;  	if (!hibernation_available())  		return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)  			swap_type_of(swsusp_resume_device, 0, NULL) : -1;  		data->mode = O_RDONLY;  		data->free_bitmaps = false; -		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); +		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);  		if (error) -			pm_notifier_call_chain(PM_POST_HIBERNATION); +			
__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);  	} else {  		/*  		 * Resuming.  We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)  		data->swap = -1;  		data->mode = O_WRONLY; -		error = pm_notifier_call_chain(PM_RESTORE_PREPARE); +		error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);  		if (!error) {  			error = create_basic_memory_bitmaps();  			data->free_bitmaps = !error; -		} +		} else +			nr_calls--; +  		if (error) -			pm_notifier_call_chain(PM_POST_RESTORE); +			__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);  	}  	if (error)  		atomic_inc(&snapshot_device_available); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)  {  	dump_stack_print_info(log_lvl); -	printk("%stask: %p ti: %p task.ti: %p\n", -	       log_lvl, current, current_thread_info(), -	       task_thread_info(current)); +	printk("%stask: %p task.stack: %p\n", +	       log_lvl, current, task_stack_page(current));  }  #endif diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -328,68 +328,57 @@ out:  	put_cpu();  } -static int profile_cpu_callback(struct notifier_block *info, -					unsigned long action, void *__cpu) +static int profile_dead_cpu(unsigned int cpu)  { -	int node, cpu = (unsigned long)__cpu;  	struct page *page; +	int i; -	switch (action) { -	case CPU_UP_PREPARE: -	case CPU_UP_PREPARE_FROZEN: -		node = cpu_to_mem(cpu); -		per_cpu(cpu_profile_flip, cpu) = 0; -		if (!per_cpu(cpu_profile_hits, cpu)[1]) { -			page = __alloc_pages_node(node, -					GFP_KERNEL | __GFP_ZERO, -					0); -			if (!page) -				return notifier_from_errno(-ENOMEM); -			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); -		} -		if (!per_cpu(cpu_profile_hits, cpu)[0]) { -			page = __alloc_pages_node(node, -					GFP_KERNEL | __GFP_ZERO, -					0); -			if (!page) -				goto out_free; -			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); -		} -		break; -out_free: -		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); -		per_cpu(cpu_profile_hits, cpu)[1] = NULL; -		__free_page(page); -		return notifier_from_errno(-ENOMEM); -	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -		if (prof_cpu_mask != NULL) -			cpumask_set_cpu(cpu, prof_cpu_mask); -		break; -	case CPU_UP_CANCELED: -	case CPU_UP_CANCELED_FROZEN: -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		if (prof_cpu_mask != NULL) -			cpumask_clear_cpu(cpu, prof_cpu_mask); -		if (per_cpu(cpu_profile_hits, cpu)[0]) { -			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); -			per_cpu(cpu_profile_hits, cpu)[0] = NULL; +	if (prof_cpu_mask != NULL) +		cpumask_clear_cpu(cpu, prof_cpu_mask); + +	for (i = 0; i < 2; i++) { +		if (per_cpu(cpu_profile_hits, cpu)[i]) { +			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); +			per_cpu(cpu_profile_hits, cpu)[i] = NULL;  			__free_page(page);  		} -		if (per_cpu(cpu_profile_hits, cpu)[1]) { -			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); -			per_cpu(cpu_profile_hits, cpu)[1] = NULL; -			__free_page(page); +	} +	return 0; +} + +static int profile_prepare_cpu(unsigned int cpu) +{ +	int i, node = cpu_to_mem(cpu); +	struct page *page; + +	per_cpu(cpu_profile_flip, cpu) = 0; + +	for (i = 0; i < 2; i++) { +		if (per_cpu(cpu_profile_hits, cpu)[i]) +			continue; 
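
The suspend.c and user.c hunks above convert pm_notifier_call_chain() to __pm_notifier_call_chain() so that a failed PREPARE notification is unwound by sending the matching POST event only to the callbacks that actually ran. A minimal sketch of that pattern, using the call signature visible in the hunks; example_prepare() itself is illustrative and not part of the patch:

	int example_prepare(void)
	{
		int error, nr_calls = 0;

		/* Run every PREPARE callback, counting how many were invoked. */
		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
		if (error)
			/*
			 * nr_calls includes the callback that returned the error,
			 * so drop it and notify only the ones that completed.
			 */
			__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);

		return error;
	}
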
+ +		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); +		if (!page) { +			profile_dead_cpu(cpu); +			return -ENOMEM;  		} -		break; +		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); +  	} -	return NOTIFY_OK; +	return 0; +} + +static int profile_online_cpu(unsigned int cpu) +{ +	if (prof_cpu_mask != NULL) +		cpumask_set_cpu(cpu, prof_cpu_mask); + +	return 0;  } +  #else /* !CONFIG_SMP */  #define profile_flip_buffers()		do { } while (0)  #define profile_discard_flip_buffers()	do { } while (0) -#define profile_cpu_callback		NULL  static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)  { @@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {  	.llseek		= default_llseek,  }; -#ifdef CONFIG_SMP -static void profile_nop(void *unused) -{ -} - -static int create_hash_tables(void) +int __ref create_proc_profile(void)  { -	int cpu; - -	for_each_online_cpu(cpu) { -		int node = cpu_to_mem(cpu); -		struct page *page; - -		page = __alloc_pages_node(node, -				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, -				0); -		if (!page) -			goto out_cleanup; -		per_cpu(cpu_profile_hits, cpu)[1] -				= (struct profile_hit *)page_address(page); -		page = __alloc_pages_node(node, -				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, -				0); -		if (!page) -			goto out_cleanup; -		per_cpu(cpu_profile_hits, cpu)[0] -				= (struct profile_hit *)page_address(page); -	} -	return 0; -out_cleanup: -	prof_on = 0; -	smp_mb(); -	on_each_cpu(profile_nop, NULL, 1); -	for_each_online_cpu(cpu) { -		struct page *page; - -		if (per_cpu(cpu_profile_hits, cpu)[0]) { -			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); -			per_cpu(cpu_profile_hits, cpu)[0] = NULL; -			__free_page(page); -		} -		if (per_cpu(cpu_profile_hits, cpu)[1]) { -			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); -			per_cpu(cpu_profile_hits, cpu)[1] = NULL; -			__free_page(page); -		} -	} -	return -1; -} -#else -#define create_hash_tables()			({ 0; }) +	struct proc_dir_entry *entry; +#ifdef CONFIG_SMP +	enum cpuhp_state online_state;  #endif -int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ -{ -	struct proc_dir_entry *entry;  	int err = 0;  	if (!prof_on)  		return 0; - -	cpu_notifier_register_begin(); - -	if (create_hash_tables()) { -		err = -ENOMEM; -		goto out; -	} - +#ifdef CONFIG_SMP +	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", +				profile_prepare_cpu, profile_dead_cpu); +	if (err) +		return err; + +	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", +				profile_online_cpu, NULL); +	if (err < 0) +		goto err_state_prep; +	online_state = err; +	err = 0; +#endif  	entry = proc_create("profile", S_IWUSR | S_IRUGO,  			    NULL, &proc_profile_operations);  	if (!entry) -		goto out; +		goto err_state_onl;  	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); -	__hotcpu_notifier(profile_cpu_callback, 0); -out: -	cpu_notifier_register_done(); +	return err; +err_state_onl: +#ifdef CONFIG_SMP +	cpuhp_remove_state(online_state); +err_state_prep: +	cpuhp_remove_state(CPUHP_PROFILE_PREPARE); +#endif  	return err;  }  subsys_initcall(create_proc_profile); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]>");  #define VERBOSE_PERFOUT_ERRSTRING(s) \  	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
%s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");  torture_param(int, holdoff, 10, "Holdoff time before test start (s)");  torture_param(int, nreaders, -1, "Number of RCU reader threads");  torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -96,12 +96,7 @@ static int rcu_perf_writer_state;  #define MAX_MEAS 10000  #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE);  module_param(perf_runnable, int, 0444);  MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); @@ -363,8 +358,6 @@ rcu_perf_writer(void *arg)  	u64 *wdpp = writer_durations[me];  	VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); -	WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); -	WARN_ON(rcu_gp_is_normal() && gp_exp);  	WARN_ON(!wdpp);  	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));  	sp.sched_priority = 1; @@ -631,12 +624,24 @@ rcu_perf_init(void)  		firsterr = -ENOMEM;  		goto unwind;  	} +	if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { +		VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); +		firsterr = -EINVAL; +		goto unwind; +	} +	if (rcu_gp_is_normal() && gp_exp) { +		VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); +		firsterr = -EINVAL; +		goto unwind; +	}  	for (i = 0; i < nrealwriters; i++) {  		writer_durations[i] =  			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),  				GFP_KERNEL); -		if (!writer_durations[i]) +		if (!writer_durations[i]) { +			firsterr = -ENOMEM;  			goto unwind; +		}  		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,  						  writer_tasks[i]);  		if (firsterr) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)  	return rcu_torture_writer_state_names[i];  } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE);  module_param(torture_runnable, int, 0444);  MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); @@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg)  			break;  		/*  		 * The above smp_load_acquire() ensures barrier_phase load -		 * is ordered before the folloiwng ->call(). +		 * is ordered before the following ->call().  		 */  		local_irq_disable(); /* Just to test no-irq call_rcu(). */  		cur_ops->call(&rcu, rcu_torture_barrier_cbf); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;  /* Number of rcu_nodes at specified level. */  static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. 
*/ +int sysctl_panic_on_rcu_stall __read_mostly;  /*   * The rcu_scheduler_active variable transitions from zero to one just   * before the first task is spawned.  So when this variable is zero, RCU   * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier().  When this variable + * optimize synchronize_rcu() to a simple barrier().  When this variable   * is one, RCU must actually do all the hard work required to detect real   * grace periods.  This variable is also used to suppress boot-time false   * positives from lockdep-RCU error checking. @@ -159,6 +161,7 @@ static void invoke_rcu_core(void);  static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);  static void rcu_report_exp_rdp(struct rcu_state *rsp,  			       struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu);  /* rcuc/rcub kthread realtime priority */  #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);   * offline to continue to use RCU for one jiffy after marking itself   * offline in the cpu_online_mask.  This leniency is necessary given the   * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the CPU_DYING - * notifiers. + * the fact that a CPU enters the scheduler after completing the teardown + * of the CPU.   * - * This is also why RCU internally marks CPUs online during the - * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * This is also why RCU internally marks CPUs online during in the + * preparation phase and offline after the CPU has been taken down.   *   * Disable checking if in an NMI handler because we cannot safely report   * errors from NMI handlers anyway. @@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)  	rcu_for_each_leaf_node(rsp, rnp) {  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		if (rnp->qsmask != 0) { -			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) -				if (rnp->qsmask & (1UL << cpu)) -					dump_cpu_task(rnp->grplo + cpu); +			for_each_leaf_node_possible_cpu(rnp, cpu) +				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) +					dump_cpu_task(cpu);  		}  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} @@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)  	}  } +static inline void panic_on_rcu_stall(void) +{ +	if (sysctl_panic_on_rcu_stall) +		panic("RCU Stall\n"); +} +  static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)  {  	int cpu; @@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		ndetected += rcu_print_task_stall(rnp);  		if (rnp->qsmask != 0) { -			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) -				if (rnp->qsmask & (1UL << cpu)) { -					print_cpu_stall_info(rsp, -							     rnp->grplo + cpu); +			for_each_leaf_node_possible_cpu(rnp, cpu) +				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { +					print_cpu_stall_info(rsp, cpu);  					ndetected++;  				}  		} @@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)  	rcu_check_gp_kthread_starvation(rsp); +	panic_on_rcu_stall(); +  	force_quiescent_state(rsp);  /* Kick them all. 
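
The new sysctl_panic_on_rcu_stall flag added above is consulted from both stall-warning paths via panic_on_rcu_stall(). The ctl_table entry that exposes the flag is not part of these hunks; a hypothetical wiring, shown only to illustrate how such a knob is typically surfaced, could look like this:

	#include <linux/sysctl.h>

	extern int sysctl_panic_on_rcu_stall;	/* defined in the tree.c hunk above */

	static int zero;
	static int one = 1;

	/* Hypothetical ctl_table entry; the real wiring lives outside these hunks. */
	static struct ctl_table rcu_stall_example_table[] = {
		{
			.procname	= "panic_on_rcu_stall",
			.data		= &sysctl_panic_on_rcu_stall,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &zero,
			.extra2		= &one,
		},
		{ }
	};
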
*/  } @@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp)  			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	panic_on_rcu_stall(); +  	/*  	 * Attempt to revive the RCU machinery by forcing a context switch.  	 * @@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)  	 * of the tree within the rsp->node[] array.  Note that other CPUs  	 * will access only the leaves of the hierarchy, thus seeing that no  	 * grace period is in progress, at least until the corresponding -	 * leaf node has been initialized.  In addition, we have excluded -	 * CPU-hotplug operations. +	 * leaf node has been initialized.  	 *  	 * The grace period cannot complete until the initialization  	 * process finishes, because this kthread handles both. @@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,  				  unsigned long *maxj),  			 bool *isidle, unsigned long *maxj)  { -	unsigned long bit;  	int cpu;  	unsigned long flags;  	unsigned long mask; @@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,  				continue;  			}  		} -		cpu = rnp->grplo; -		bit = 1; -		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { +		for_each_leaf_node_possible_cpu(rnp, cpu) { +			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);  			if ((rnp->qsmask & bit) != 0) {  				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))  					mask |= bit; @@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)  	return ULONG_CMP_GE(READ_ONCE(*sp), s);  } -/* Wrapper functions for expedited grace periods.  */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ -	rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ -	rcu_seq_end(&rsp->expedited_sequence); -	smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ -	unsigned long s; - -	smp_mb(); /* Caller's modifications seen first by other CPUs. */ -	s = rcu_seq_snap(&rsp->expedited_sequence); -	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); -	return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ -	return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity.  Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online.  This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ -	bool done; -	unsigned long flags; -	unsigned long mask; -	unsigned long oldmask; -	int ncpus = READ_ONCE(rsp->ncpus); -	struct rcu_node *rnp; -	struct rcu_node *rnp_up; - -	/* If no new CPUs onlined since last time, nothing to do. */ -	if (likely(ncpus == rsp->ncpus_snap)) -		return; -	rsp->ncpus_snap = ncpus; - -	/* -	 * Each pass through the following loop propagates newly onlined -	 * CPUs for the current rcu_node structure up the rcu_node tree. -	 */ -	rcu_for_each_leaf_node(rsp, rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		if (rnp->expmaskinit == rnp->expmaskinitnext) { -			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -			continue;  /* No new CPUs, nothing to do. */ -		} - -		/* Update this node's mask, track old value for propagation. 
*/ -		oldmask = rnp->expmaskinit; -		rnp->expmaskinit = rnp->expmaskinitnext; -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - -		/* If was already nonzero, nothing to propagate. */ -		if (oldmask) -			continue; - -		/* Propagate the new CPU up the tree. */ -		mask = rnp->grpmask; -		rnp_up = rnp->parent; -		done = false; -		while (rnp_up) { -			raw_spin_lock_irqsave_rcu_node(rnp_up, flags); -			if (rnp_up->expmaskinit) -				done = true; -			rnp_up->expmaskinit |= mask; -			raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); -			if (done) -				break; -			mask = rnp_up->grpmask; -			rnp_up = rnp_up->parent; -		} -	} -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ -	unsigned long flags; -	struct rcu_node *rnp; - -	sync_exp_reset_tree_hotplug(rsp); -	rcu_for_each_node_breadth_first(rsp, rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		WARN_ON_ONCE(rnp->expmask); -		rnp->expmask = rnp->expmaskinit; -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	} -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period.  Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ -	return rnp->exp_tasks == NULL && -	       READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period.  This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree.  (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, -				 bool wake, unsigned long flags) -	__releases(rnp->lock) -{ -	unsigned long mask; - -	for (;;) { -		if (!sync_rcu_preempt_exp_done(rnp)) { -			if (!rnp->expmask) -				rcu_initiate_boost(rnp, flags); -			else -				raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -			break; -		} -		if (rnp->parent == NULL) { -			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -			if (wake) { -				smp_mb(); /* EGP done before wake_up(). */ -				swake_up(&rsp->expedited_wq); -			} -			break; -		} -		mask = rnp->grpmask; -		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ -		rnp = rnp->parent; -		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ -		WARN_ON_ONCE(!(rnp->expmask & mask)); -		rnp->expmask &= ~mask; -	} -} - -/* - * Report expedited quiescent state for specified node.  This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, -					      struct rcu_node *rnp, bool wake) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	__rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure.  
Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, -				    unsigned long mask, bool wake) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	if (!(rnp->expmask & mask)) { -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	rnp->expmask &= ~mask; -	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, -			       bool wake) -{ -	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, -			       unsigned long s) -{ -	if (rcu_exp_gp_seq_done(rsp, s)) { -		trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); -		/* Ensure test happens before caller kfree(). */ -		smp_mb__before_atomic(); /* ^^^ */ -		atomic_long_inc(stat); -		return true; -	} -	return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods.  Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held.  Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ -	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); -	struct rcu_node *rnp = rdp->mynode; -	struct rcu_node *rnp_root = rcu_get_root(rsp); - -	/* Low-contention fastpath. */ -	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && -	    (rnp == rnp_root || -	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && -	    !mutex_is_locked(&rsp->exp_mutex) && -	    mutex_trylock(&rsp->exp_mutex)) -		goto fastpath; - -	/* -	 * Each pass through the following loop works its way up -	 * the rcu_node tree, returning if others have done the work or -	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping -	 * from CPU to rcu_node structure can be inexact, as it is just -	 * promoting locality and is not strictly needed for correctness. -	 */ -	for (; rnp != NULL; rnp = rnp->parent) { -		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) -			return true; - -		/* Work not done, either wait here or go up. */ -		spin_lock(&rnp->exp_lock); -		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - -			/* Someone else doing GP, so wait for them. */ -			spin_unlock(&rnp->exp_lock); -			trace_rcu_exp_funnel_lock(rsp->name, rnp->level, -						  rnp->grplo, rnp->grphi, -						  TPS("wait")); -			wait_event(rnp->exp_wq[(s >> 1) & 0x3], -				   sync_exp_work_done(rsp, -						      &rdp->exp_workdone2, s)); -			return true; -		} -		rnp->exp_seq_rq = s; /* Followers can wait on us. */ -		spin_unlock(&rnp->exp_lock); -		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, -					  rnp->grphi, TPS("nxtlvl")); -	} -	mutex_lock(&rsp->exp_mutex); -fastpath: -	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { -		mutex_unlock(&rsp->exp_mutex); -		return true; -	} -	rcu_exp_gp_seq_start(rsp); -	trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); -	return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ -static void sync_sched_exp_handler(void *data) -{ -	struct rcu_data *rdp; -	struct rcu_node *rnp; -	struct rcu_state *rsp = data; - -	rdp = this_cpu_ptr(rsp->rda); -	rnp = rdp->mynode; -	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || -	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) -		return; -	if (rcu_is_cpu_rrupt_from_idle()) { -		rcu_report_exp_rdp(&rcu_sched_state, -				   this_cpu_ptr(&rcu_sched_data), true); -		return; -	} -	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); -	resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ -	struct rcu_data *rdp; -	int ret; -	struct rcu_node *rnp; -	struct rcu_state *rsp = &rcu_sched_state; - -	rdp = per_cpu_ptr(rsp->rda, cpu); -	rnp = rdp->mynode; -	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) -		return; -	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); -	WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, -				     smp_call_func_t func) -{ -	int cpu; -	unsigned long flags; -	unsigned long mask; -	unsigned long mask_ofl_test; -	unsigned long mask_ofl_ipi; -	int ret; -	struct rcu_node *rnp; - -	sync_exp_reset_tree(rsp); -	rcu_for_each_leaf_node(rsp, rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); - -		/* Each pass checks a CPU for identity, offline, and idle. */ -		mask_ofl_test = 0; -		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { -			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - -			if (raw_smp_processor_id() == cpu || -			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) -				mask_ofl_test |= rdp->grpmask; -		} -		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - -		/* -		 * Need to wait for any blocked tasks as well.  Note that -		 * additional blocking tasks will also block the expedited -		 * GP until such time as the ->expmask bits are cleared. -		 */ -		if (rcu_preempt_has_tasks(rnp)) -			rnp->exp_tasks = rnp->blkd_tasks.next; -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - -		/* IPI the remaining CPUs for expedited quiescent state. */ -		mask = 1; -		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -			if (!(mask_ofl_ipi & mask)) -				continue; -retry_ipi: -			ret = smp_call_function_single(cpu, func, rsp, 0); -			if (!ret) { -				mask_ofl_ipi &= ~mask; -				continue; -			} -			/* Failed, raced with offline. */ -			raw_spin_lock_irqsave_rcu_node(rnp, flags); -			if (cpu_online(cpu) && -			    (rnp->expmask & mask)) { -				raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -				schedule_timeout_uninterruptible(1); -				if (cpu_online(cpu) && -				    (rnp->expmask & mask)) -					goto retry_ipi; -				raw_spin_lock_irqsave_rcu_node(rnp, flags); -			} -			if (!(rnp->expmask & mask)) -				mask_ofl_ipi &= ~mask; -			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		} -		/* Report quiescent states for those that went offline. 
*/ -		mask_ofl_test |= mask_ofl_ipi; -		if (mask_ofl_test) -			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); -	} -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ -	int cpu; -	unsigned long jiffies_stall; -	unsigned long jiffies_start; -	unsigned long mask; -	int ndetected; -	struct rcu_node *rnp; -	struct rcu_node *rnp_root = rcu_get_root(rsp); -	int ret; - -	jiffies_stall = rcu_jiffies_till_stall_check(); -	jiffies_start = jiffies; - -	for (;;) { -		ret = swait_event_timeout( -				rsp->expedited_wq, -				sync_rcu_preempt_exp_done(rnp_root), -				jiffies_stall); -		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) -			return; -		if (ret < 0) { -			/* Hit a signal, disable CPU stall warnings. */ -			swait_event(rsp->expedited_wq, -				   sync_rcu_preempt_exp_done(rnp_root)); -			return; -		} -		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", -		       rsp->name); -		ndetected = 0; -		rcu_for_each_leaf_node(rsp, rnp) { -			ndetected += rcu_print_task_exp_stall(rnp); -			mask = 1; -			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -				struct rcu_data *rdp; - -				if (!(rnp->expmask & mask)) -					continue; -				ndetected++; -				rdp = per_cpu_ptr(rsp->rda, cpu); -				pr_cont(" %d-%c%c%c", cpu, -					"O."[!!cpu_online(cpu)], -					"o."[!!(rdp->grpmask & rnp->expmaskinit)], -					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); -			} -			mask <<= 1; -		} -		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", -			jiffies - jiffies_start, rsp->expedited_sequence, -			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); -		if (ndetected) { -			pr_err("blocking rcu_node structures:"); -			rcu_for_each_node_breadth_first(rsp, rnp) { -				if (rnp == rnp_root) -					continue; /* printed unconditionally */ -				if (sync_rcu_preempt_exp_done(rnp)) -					continue; -				pr_cont(" l=%u:%d-%d:%#lx/%c", -					rnp->level, rnp->grplo, rnp->grphi, -					rnp->expmask, -					".T"[!!rnp->exp_tasks]); -			} -			pr_cont("\n"); -		} -		rcu_for_each_leaf_node(rsp, rnp) { -			mask = 1; -			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -				if (!(rnp->expmask & mask)) -					continue; -				dump_cpu_task(cpu); -			} -		} -		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; -	} -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period.  Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ -	struct rcu_node *rnp; - -	synchronize_sched_expedited_wait(rsp); -	rcu_exp_gp_seq_end(rsp); -	trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - -	/* -	 * Switch over to wakeup mode, allowing the next GP, but -only- the -	 * next GP, to proceed. -	 */ -	mutex_lock(&rsp->exp_wake_mutex); -	mutex_unlock(&rsp->exp_mutex); - -	rcu_for_each_node_breadth_first(rsp, rnp) { -		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { -			spin_lock(&rnp->exp_lock); -			/* Recheck, avoid hang in case someone just arrived. 
*/ -			if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) -				rnp->exp_seq_rq = s; -			spin_unlock(&rnp->exp_lock); -		} -		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); -	} -	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); -	mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly.  This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code.  In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ -	unsigned long s; -	struct rcu_state *rsp = &rcu_sched_state; - -	/* If only one CPU, this is automatically a grace period. */ -	if (rcu_blocking_is_gp()) -		return; - -	/* If expedited grace periods are prohibited, fall back to normal. */ -	if (rcu_gp_is_normal()) { -		wait_rcu_gp(call_rcu_sched); -		return; -	} - -	/* Take a snapshot of the sequence number.  */ -	s = rcu_exp_gp_seq_snap(rsp); -	if (exp_funnel_lock(rsp, s)) -		return;  /* Someone else did our work for us. */ - -	/* Initialize the rcu_node tree in preparation for the wait. */ -	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - -	/* Wait and clean up, including waking everyone. */ -	rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -  /*   * Check to see if there is any immediate RCU-related work to be done   * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	/* Set up local state, ensuring consistent view of global state. 
*/  	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); +	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);  	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);  	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);  	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  } -static void rcu_prepare_cpu(int cpu) +int rcutree_prepare_cpu(unsigned int cpu)  {  	struct rcu_state *rsp;  	for_each_rcu_flavor(rsp)  		rcu_init_percpu_data(cpu, rsp); + +	rcu_prepare_kthreads(cpu); +	rcu_spawn_all_nocb_kthreads(cpu); + +	return 0; +} + +static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +{ +	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + +	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); +} + +int rcutree_online_cpu(unsigned int cpu) +{ +	sync_sched_exp_online_cleanup(cpu); +	rcutree_affinity_setting(cpu, -1); +	return 0; +} + +int rcutree_offline_cpu(unsigned int cpu) +{ +	rcutree_affinity_setting(cpu, cpu); +	return 0; +} + + +int rcutree_dying_cpu(unsigned int cpu) +{ +	struct rcu_state *rsp; + +	for_each_rcu_flavor(rsp) +		rcu_cleanup_dying_cpu(rsp); +	return 0; +} + +int rcutree_dead_cpu(unsigned int cpu) +{ +	struct rcu_state *rsp; + +	for_each_rcu_flavor(rsp) { +		rcu_cleanup_dead_cpu(cpu, rsp); +		do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); +	} +	return 0;  }  #ifdef CONFIG_HOTPLUG_CPU @@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);  	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */ -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) -		return; -  	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */  	mask = rdp->grpmask;  	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ @@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)  }  #endif -/* - * Handle CPU online/offline notification events. - */ -int rcu_cpu_notify(struct notifier_block *self, -		   unsigned long action, void *hcpu) -{ -	long cpu = (long)hcpu; -	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); -	struct rcu_node *rnp = rdp->mynode; -	struct rcu_state *rsp; - -	switch (action) { -	case CPU_UP_PREPARE: -	case CPU_UP_PREPARE_FROZEN: -		rcu_prepare_cpu(cpu); -		rcu_prepare_kthreads(cpu); -		rcu_spawn_all_nocb_kthreads(cpu); -		break; -	case CPU_ONLINE: -	case CPU_DOWN_FAILED: -		sync_sched_exp_online_cleanup(cpu); -		rcu_boost_kthread_setaffinity(rnp, -1); -		break; -	case CPU_DOWN_PREPARE: -		rcu_boost_kthread_setaffinity(rnp, cpu); -		break; -	case CPU_DYING: -	case CPU_DYING_FROZEN: -		for_each_rcu_flavor(rsp) -			rcu_cleanup_dying_cpu(rsp); -		break; -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -	case CPU_UP_CANCELED: -	case CPU_UP_CANCELED_FROZEN: -		for_each_rcu_flavor(rsp) { -			rcu_cleanup_dead_cpu(cpu, rsp); -			do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); -		} -		break; -	default: -		break; -	} -	return NOTIFY_OK; -} -  static int rcu_pm_notify(struct notifier_block *self,  			 unsigned long action, void *hcpu)  { @@ -4745,10 +4208,10 @@ void __init rcu_init(void)  	 * this is called early in boot, before either interrupts  	 * or the scheduler are operational.  	 
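
Both the earlier profile.c hunk and the RCU changes above replace CPU-notifier switch statements with per-stage hotplug callbacks (rcutree_prepare_cpu(), rcutree_online_cpu(), rcutree_dying_cpu(), rcutree_dead_cpu()). For a subsystem without a fixed slot in the hotplug sequence, the dynamic online state used by profile.c is sufficient; a hedged sketch of that registration, with illustrative names:

	#include <linux/init.h>
	#include <linux/cpuhotplug.h>

	static int example_online_cpu(unsigned int cpu)
	{
		/* Per-CPU bring-up work; return a negative errno on failure. */
		return 0;
	}

	static int example_offline_cpu(unsigned int cpu)
	{
		/* Undo example_online_cpu() for this CPU. */
		return 0;
	}

	static int __init example_init(void)
	{
		int ret;

		/*
		 * CPUHP_AP_ONLINE_DYN allocates a dynamic slot in the online phase;
		 * cpuhp_setup_state() also invokes the startup callback on every
		 * CPU that is already online when the state is registered.
		 */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
					example_online_cpu, example_offline_cpu);
		return ret < 0 ? ret : 0;
	}
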
*/ -	cpu_notifier(rcu_cpu_notify, 0);  	pm_notifier(rcu_pm_notify, 0);  	for_each_online_cpu(cpu) -		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); +		rcutree_prepare_cpu(cpu);  } +#include "tree_exp.h"  #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,13 @@ struct rcu_node {  } ____cacheline_internodealigned_in_smp;  /* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + +/*   * Do a full breadth-first scan of the rcu_node structures for the   * specified rcu_state structure.   */ @@ -281,6 +288,14 @@ struct rcu_node {  	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)  /* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ +	for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ +	     cpu <= rnp->grphi; \ +	     cpu = cpumask_next((cpu), cpu_possible_mask)) + +/*   * Union to allow "aggregate OR" operation on the need for a quiescent   * state by the normal and expedited grace periods.   */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,655 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney <[email protected]> + */ + +/* Wrapper functions for expedited grace periods.  */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ +	rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ +	rcu_seq_end(&rsp->expedited_sequence); +	smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ +	unsigned long s; + +	smp_mb(); /* Caller's modifications seen first by other CPUs. */ +	s = rcu_seq_snap(&rsp->expedited_sequence); +	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); +	return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ +	return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity.  Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online.  This means that this function normally takes its + * no-work-to-do fastpath. 
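
The tree.h hunk above introduces for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit() so that callers stop open-coding "cpu - rnp->grplo" shifts against node-local masks. A sketch of how the two helpers are meant to be used together; the function itself is illustrative and relies on the tree.h internals shown above:

	/*
	 * Illustrative only: walk the possible CPUs of each leaf rcu_node and
	 * test their bit in a node-local mask.
	 */
	static void example_scan_leaves(struct rcu_state *rsp)
	{
		int cpu;
		struct rcu_node *rnp;

		rcu_for_each_leaf_node(rsp, rnp) {
			for_each_leaf_node_possible_cpu(rnp, cpu) {
				unsigned long bit = leaf_node_cpu_bit(rnp, cpu);

				if (READ_ONCE(rnp->qsmask) & bit)
					pr_info("rcu: CPU %d still owes a quiescent state\n", cpu);
			}
		}
	}
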
+ */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ +	bool done; +	unsigned long flags; +	unsigned long mask; +	unsigned long oldmask; +	int ncpus = READ_ONCE(rsp->ncpus); +	struct rcu_node *rnp; +	struct rcu_node *rnp_up; + +	/* If no new CPUs onlined since last time, nothing to do. */ +	if (likely(ncpus == rsp->ncpus_snap)) +		return; +	rsp->ncpus_snap = ncpus; + +	/* +	 * Each pass through the following loop propagates newly onlined +	 * CPUs for the current rcu_node structure up the rcu_node tree. +	 */ +	rcu_for_each_leaf_node(rsp, rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		if (rnp->expmaskinit == rnp->expmaskinitnext) { +			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +			continue;  /* No new CPUs, nothing to do. */ +		} + +		/* Update this node's mask, track old value for propagation. */ +		oldmask = rnp->expmaskinit; +		rnp->expmaskinit = rnp->expmaskinitnext; +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + +		/* If was already nonzero, nothing to propagate. */ +		if (oldmask) +			continue; + +		/* Propagate the new CPU up the tree. */ +		mask = rnp->grpmask; +		rnp_up = rnp->parent; +		done = false; +		while (rnp_up) { +			raw_spin_lock_irqsave_rcu_node(rnp_up, flags); +			if (rnp_up->expmaskinit) +				done = true; +			rnp_up->expmaskinit |= mask; +			raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); +			if (done) +				break; +			mask = rnp_up->grpmask; +			rnp_up = rnp_up->parent; +		} +	} +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ +	unsigned long flags; +	struct rcu_node *rnp; + +	sync_exp_reset_tree_hotplug(rsp); +	rcu_for_each_node_breadth_first(rsp, rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		WARN_ON_ONCE(rnp->expmask); +		rnp->expmask = rnp->expmaskinit; +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	} +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period.  Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ +	return rnp->exp_tasks == NULL && +	       READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period.  This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree.  (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, +				 bool wake, unsigned long flags) +	__releases(rnp->lock) +{ +	unsigned long mask; + +	for (;;) { +		if (!sync_rcu_preempt_exp_done(rnp)) { +			if (!rnp->expmask) +				rcu_initiate_boost(rnp, flags); +			else +				raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +			break; +		} +		if (rnp->parent == NULL) { +			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +			if (wake) { +				smp_mb(); /* EGP done before wake_up(). 
*/ +				swake_up(&rsp->expedited_wq); +			} +			break; +		} +		mask = rnp->grpmask; +		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ +		rnp = rnp->parent; +		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ +		WARN_ON_ONCE(!(rnp->expmask & mask)); +		rnp->expmask &= ~mask; +	} +} + +/* + * Report expedited quiescent state for specified node.  This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, +					      struct rcu_node *rnp, bool wake) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	__rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure.  Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, +				    unsigned long mask, bool wake) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	if (!(rnp->expmask & mask)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	rnp->expmask &= ~mask; +	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, +			       bool wake) +{ +	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, +			       unsigned long s) +{ +	if (rcu_exp_gp_seq_done(rsp, s)) { +		trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); +		/* Ensure test happens before caller kfree(). */ +		smp_mb__before_atomic(); /* ^^^ */ +		atomic_long_inc(stat); +		return true; +	} +	return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods.  Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held.  Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ +	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); +	struct rcu_node *rnp = rdp->mynode; +	struct rcu_node *rnp_root = rcu_get_root(rsp); + +	/* Low-contention fastpath. */ +	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && +	    (rnp == rnp_root || +	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && +	    mutex_trylock(&rsp->exp_mutex)) +		goto fastpath; + +	/* +	 * Each pass through the following loop works its way up +	 * the rcu_node tree, returning if others have done the work or +	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping +	 * from CPU to rcu_node structure can be inexact, as it is just +	 * promoting locality and is not strictly needed for correctness. +	 */ +	for (; rnp != NULL; rnp = rnp->parent) { +		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) +			return true; + +		/* Work not done, either wait here or go up. */ +		spin_lock(&rnp->exp_lock); +		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + +			/* Someone else doing GP, so wait for them. 
*/ +			spin_unlock(&rnp->exp_lock); +			trace_rcu_exp_funnel_lock(rsp->name, rnp->level, +						  rnp->grplo, rnp->grphi, +						  TPS("wait")); +			wait_event(rnp->exp_wq[(s >> 1) & 0x3], +				   sync_exp_work_done(rsp, +						      &rdp->exp_workdone2, s)); +			return true; +		} +		rnp->exp_seq_rq = s; /* Followers can wait on us. */ +		spin_unlock(&rnp->exp_lock); +		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, +					  rnp->grphi, TPS("nxtlvl")); +	} +	mutex_lock(&rsp->exp_mutex); +fastpath: +	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { +		mutex_unlock(&rsp->exp_mutex); +		return true; +	} +	rcu_exp_gp_seq_start(rsp); +	trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); +	return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp; +	struct rcu_state *rsp = data; + +	rdp = this_cpu_ptr(rsp->rda); +	rnp = rdp->mynode; +	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || +	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) +		return; +	if (rcu_is_cpu_rrupt_from_idle()) { +		rcu_report_exp_rdp(&rcu_sched_state, +				   this_cpu_ptr(&rcu_sched_data), true); +		return; +	} +	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); +	resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ +	struct rcu_data *rdp; +	int ret; +	struct rcu_node *rnp; +	struct rcu_state *rsp = &rcu_sched_state; + +	rdp = per_cpu_ptr(rsp->rda, cpu); +	rnp = rdp->mynode; +	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) +		return; +	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); +	WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, +				     smp_call_func_t func) +{ +	int cpu; +	unsigned long flags; +	unsigned long mask_ofl_test; +	unsigned long mask_ofl_ipi; +	int ret; +	struct rcu_node *rnp; + +	sync_exp_reset_tree(rsp); +	rcu_for_each_leaf_node(rsp, rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); + +		/* Each pass checks a CPU for identity, offline, and idle. */ +		mask_ofl_test = 0; +		for_each_leaf_node_possible_cpu(rnp, cpu) { +			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); +			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + +			if (raw_smp_processor_id() == cpu || +			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) +				mask_ofl_test |= rdp->grpmask; +		} +		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + +		/* +		 * Need to wait for any blocked tasks as well.  Note that +		 * additional blocking tasks will also block the expedited +		 * GP until such time as the ->expmask bits are cleared. +		 */ +		if (rcu_preempt_has_tasks(rnp)) +			rnp->exp_tasks = rnp->blkd_tasks.next; +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + +		/* IPI the remaining CPUs for expedited quiescent state. */ +		for_each_leaf_node_possible_cpu(rnp, cpu) { +			unsigned long mask = leaf_node_cpu_bit(rnp, cpu); +			if (!(mask_ofl_ipi & mask)) +				continue; +retry_ipi: +			ret = smp_call_function_single(cpu, func, rsp, 0); +			if (!ret) { +				mask_ofl_ipi &= ~mask; +				continue; +			} +			/* Failed, raced with offline. 
*/ +			raw_spin_lock_irqsave_rcu_node(rnp, flags); +			if (cpu_online(cpu) && +			    (rnp->expmask & mask)) { +				raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +				schedule_timeout_uninterruptible(1); +				if (cpu_online(cpu) && +				    (rnp->expmask & mask)) +					goto retry_ipi; +				raw_spin_lock_irqsave_rcu_node(rnp, flags); +			} +			if (!(rnp->expmask & mask)) +				mask_ofl_ipi &= ~mask; +			raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		} +		/* Report quiescent states for those that went offline. */ +		mask_ofl_test |= mask_ofl_ipi; +		if (mask_ofl_test) +			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +	} +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ +	int cpu; +	unsigned long jiffies_stall; +	unsigned long jiffies_start; +	unsigned long mask; +	int ndetected; +	struct rcu_node *rnp; +	struct rcu_node *rnp_root = rcu_get_root(rsp); +	int ret; + +	jiffies_stall = rcu_jiffies_till_stall_check(); +	jiffies_start = jiffies; + +	for (;;) { +		ret = swait_event_timeout( +				rsp->expedited_wq, +				sync_rcu_preempt_exp_done(rnp_root), +				jiffies_stall); +		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) +			return; +		if (ret < 0) { +			/* Hit a signal, disable CPU stall warnings. */ +			swait_event(rsp->expedited_wq, +				   sync_rcu_preempt_exp_done(rnp_root)); +			return; +		} +		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", +		       rsp->name); +		ndetected = 0; +		rcu_for_each_leaf_node(rsp, rnp) { +			ndetected += rcu_print_task_exp_stall(rnp); +			for_each_leaf_node_possible_cpu(rnp, cpu) { +				struct rcu_data *rdp; + +				mask = leaf_node_cpu_bit(rnp, cpu); +				if (!(rnp->expmask & mask)) +					continue; +				ndetected++; +				rdp = per_cpu_ptr(rsp->rda, cpu); +				pr_cont(" %d-%c%c%c", cpu, +					"O."[!!cpu_online(cpu)], +					"o."[!!(rdp->grpmask & rnp->expmaskinit)], +					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); +			} +		} +		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", +			jiffies - jiffies_start, rsp->expedited_sequence, +			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); +		if (ndetected) { +			pr_err("blocking rcu_node structures:"); +			rcu_for_each_node_breadth_first(rsp, rnp) { +				if (rnp == rnp_root) +					continue; /* printed unconditionally */ +				if (sync_rcu_preempt_exp_done(rnp)) +					continue; +				pr_cont(" l=%u:%d-%d:%#lx/%c", +					rnp->level, rnp->grplo, rnp->grphi, +					rnp->expmask, +					".T"[!!rnp->exp_tasks]); +			} +			pr_cont("\n"); +		} +		rcu_for_each_leaf_node(rsp, rnp) { +			for_each_leaf_node_possible_cpu(rnp, cpu) { +				mask = leaf_node_cpu_bit(rnp, cpu); +				if (!(rnp->expmask & mask)) +					continue; +				dump_cpu_task(cpu); +			} +		} +		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; +	} +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period.  Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ +	struct rcu_node *rnp; + +	synchronize_sched_expedited_wait(rsp); +	rcu_exp_gp_seq_end(rsp); +	trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + +	/* +	 * Switch over to wakeup mode, allowing the next GP, but -only- the +	 * next GP, to proceed. 
+	 */ +	mutex_lock(&rsp->exp_wake_mutex); +	mutex_unlock(&rsp->exp_mutex); + +	rcu_for_each_node_breadth_first(rsp, rnp) { +		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { +			spin_lock(&rnp->exp_lock); +			/* Recheck, avoid hang in case someone just arrived. */ +			if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) +				rnp->exp_seq_rq = s; +			spin_unlock(&rnp->exp_lock); +		} +		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); +	} +	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); +	mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly.  This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code.  In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ +	unsigned long s; +	struct rcu_state *rsp = &rcu_sched_state; + +	/* If only one CPU, this is automatically a grace period. */ +	if (rcu_blocking_is_gp()) +		return; + +	/* If expedited grace periods are prohibited, fall back to normal. */ +	if (rcu_gp_is_normal()) { +		wait_rcu_gp(call_rcu_sched); +		return; +	} + +	/* Take a snapshot of the sequence number.  */ +	s = rcu_exp_gp_seq_snap(rsp); +	if (exp_funnel_lock(rsp, s)) +		return;  /* Someone else did our work for us. */ + +	/* Initialize the rcu_node tree in preparation for the wait. */ +	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + +	/* Wait and clean up, including waking everyone. */ +	rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single().  If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree.  Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ +	struct rcu_data *rdp; +	struct rcu_state *rsp = info; +	struct task_struct *t = current; + +	/* +	 * Within an RCU read-side critical section, request that the next +	 * rcu_read_unlock() report.  Unless this RCU read-side critical +	 * section has already blocked, in which case it is already set +	 * up for the expedited grace period to wait on it. +	 */ +	if (t->rcu_read_lock_nesting > 0 && +	    !t->rcu_read_unlock_special.b.blocked) { +		t->rcu_read_unlock_special.b.exp_need_qs = true; +		return; +	} + +	/* +	 * We are either exiting an RCU read-side critical section (negative +	 * values of t->rcu_read_lock_nesting) or are not in one at all +	 * (zero value of t->rcu_read_lock_nesting).  Or we are in an RCU +	 * read-side critical section that blocked before this expedited +	 * grace period started.  Either way, we can immediately report +	 * the quiescent state. 
+	 */ +	rdp = this_cpu_ptr(rsp->rda); +	rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it.  The basic + * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state.  On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code.  In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ +	struct rcu_state *rsp = rcu_state_p; +	unsigned long s; + +	/* If expedited grace periods are prohibited, fall back to normal. */ +	if (rcu_gp_is_normal()) { +		wait_rcu_gp(call_rcu); +		return; +	} + +	s = rcu_exp_gp_seq_snap(rsp); +	if (exp_funnel_lock(rsp, s)) +		return;  /* Someone else did our work for us. */ + +	/* Initialize the rcu_node tree in preparation for the wait. */ +	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + +	/* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ +	rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ +	synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)  		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");  	if (IS_ENABLED(CONFIG_PROVE_RCU))  		pr_info("\tRCU lockdep checking is enabled.\n"); -	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) -		pr_info("\tRCU torture testing starts during boot.\n");  	if (RCU_NUM_LVLS >= 4)  		pr_info("\tFour(or more)-level hierarchy is enabled.\n");  	if (RCU_FANOUT_LEAF != 16) @@ -681,84 +679,6 @@ void synchronize_rcu(void)  }  EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single().  If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree.  Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ -	struct rcu_data *rdp; -	struct rcu_state *rsp = info; -	struct task_struct *t = current; - -	/* -	 * Within an RCU read-side critical section, request that the next -	 * rcu_read_unlock() report.  Unless this RCU read-side critical -	 * section has already blocked, in which case it is already set -	 * up for the expedited grace period to wait on it. 
-	 */ -	if (t->rcu_read_lock_nesting > 0 && -	    !t->rcu_read_unlock_special.b.blocked) { -		t->rcu_read_unlock_special.b.exp_need_qs = true; -		return; -	} - -	/* -	 * We are either exiting an RCU read-side critical section (negative -	 * values of t->rcu_read_lock_nesting) or are not in one at all -	 * (zero value of t->rcu_read_lock_nesting).  Or we are in an RCU -	 * read-side critical section that blocked before this expedited -	 * grace period started.  Either way, we can immediately report -	 * the quiescent state. -	 */ -	rdp = this_cpu_ptr(rsp->rda); -	rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it.  The basic - * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state.  On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code.  In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ -	struct rcu_state *rsp = rcu_state_p; -	unsigned long s; - -	/* If expedited grace periods are prohibited, fall back to normal. */ -	if (rcu_gp_is_normal()) { -		wait_rcu_gp(call_rcu); -		return; -	} - -	s = rcu_exp_gp_seq_snap(rsp); -	if (exp_funnel_lock(rsp, s)) -		return;  /* Someone else did our work for us. */ - -	/* Initialize the rcu_node tree in preparation for the wait. */ -	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - -	/* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ -	rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -  /**   * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.   * @@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void)  }  /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ -	synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/*   * Because preemptible RCU does not exist, rcu_barrier() is just   * another name for rcu_barrier_sched().   
*/ @@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)  		return;  	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))  		return; -	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) -		if ((mask & 0x1) && cpu != outgoingcpu) +	for_each_leaf_node_possible_cpu(rnp, cpu) +		if ((mask & leaf_node_cpu_bit(rnp, cpu)) && +		    cpu != outgoingcpu)  			cpumask_set_cpu(cpu, cm);  	if (cpumask_weight(cm) == 0)  		cpumask_setall(cm); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;  module_param(rcu_task_stall_timeout, int, 0644);  static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr;  /*   * Post an RCU-tasks callback.  First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)  {  	unsigned long flags;  	bool needwake; +	bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);  	rhp->next = NULL;  	rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)  	*rcu_tasks_cbs_tail = rhp;  	rcu_tasks_cbs_tail = &rhp->next;  	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); -	if (needwake) { +	/* We can't create the thread unless interrupts are enabled. */ +	if ((needwake && havetask) || +	    (!havetask && !irqs_disabled_flags(flags))) {  		rcu_spawn_tasks_kthread();  		wake_up(&rcu_tasks_cbs_wq);  	} @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)  static void rcu_spawn_tasks_kthread(void)  {  	static DEFINE_MUTEX(rcu_tasks_kthread_mutex); -	static struct task_struct *rcu_tasks_kthread_ptr;  	struct task_struct *t;  	if (READ_ONCE(rcu_tasks_kthread_ptr)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..5c883fe8e440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  	for (;;) {  		/* Any allowed, online CPU? */  		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { -			if (!cpu_active(dest_cpu)) +			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) +				continue; +			if (!cpu_online(dest_cpu))  				continue;  			goto out;  		} @@ -1935,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)   * chain to provide order. Instead we do:   *   *   1) smp_store_release(X->on_cpu, 0) - *   2) smp_cond_acquire(!X->on_cpu) + *   2) smp_cond_load_acquire(!X->on_cpu)   *   * Example:   * @@ -1946,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)   *   sched-out X   *   smp_store_release(X->on_cpu, 0);   * - *                    smp_cond_acquire(!X->on_cpu); + *                    smp_cond_load_acquire(&X->on_cpu, !VAL);   *                    X->state = WAKING   *                    set_task_cpu(X,2)   * @@ -1972,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)   * This means that any means of doing remote wakeups must order the CPU doing   * the wakeup against the CPU the task is going to end up running on. This,   * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).   
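The ordering comment above, smp_store_release() of ->on_cpu paired with smp_cond_load_acquire() in try_to_wake_up(), maps directly onto the C11 release/acquire model. Below is a minimal portable sketch of that idiom only, not of the kernel primitives themselves; the thread names are purely illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;	/* models p->on_cpu */
static int task_state;		/* written before the release, read after the acquire */

static void *sched_out_cpu(void *arg)
{
	task_state = 42;	/* guaranteed visible ... */
	atomic_store_explicit(&on_cpu, 0, memory_order_release); /* ... before this store */
	return NULL;
}

static void *waking_cpu(void *arg)
{
	/* The cond-load-acquire: spin until !on_cpu, with acquire ordering. */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	printf("saw on_cpu==0, task_state=%d\n", task_state);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, sched_out_cpu, NULL);
	pthread_create(&b, NULL, waking_cpu, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}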
*   */ @@ -2045,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	 * This ensures that tasks getting woken will be fully ordered against  	 * their previous state and preserve Program Order.  	 */ -	smp_cond_acquire(!p->on_cpu); +	smp_cond_load_acquire(&p->on_cpu, !VAL);  	p->sched_contributes_to_load = !!task_contributes_to_load(p);  	p->state = TASK_WAKING; @@ -2340,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)  	__sched_fork(clone_flags, p);  	/* -	 * We mark the process as running here. This guarantees that +	 * We mark the process as NEW here. This guarantees that  	 * nobody will actually run it, and a signal or other external  	 * event cannot wake it up and insert it on the runqueue either.  	 */ -	p->state = TASK_RUNNING; +	p->state = TASK_NEW;  	/*  	 * Make sure we do not leak PI boosting priority to the child. @@ -2381,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)  		p->sched_class = &fair_sched_class;  	} -	if (p->sched_class->task_fork) -		p->sched_class->task_fork(p); +	init_entity_runnable_average(&p->se);  	/*  	 * The child is not yet in the pid-hash so no cgroup attach races, @@ -2392,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)  	 * Silence PROVE_RCU.  	 */  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	set_task_cpu(p, cpu); +	/* +	 * We're setting the cpu for the first time, we don't migrate, +	 * so use __set_task_cpu(). +	 */ +	__set_task_cpu(p, cpu); +	if (p->sched_class->task_fork) +		p->sched_class->task_fork(p);  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);  #ifdef CONFIG_SCHED_INFO @@ -2524,21 +2531,22 @@ void wake_up_new_task(struct task_struct *p)  	struct rq_flags rf;  	struct rq *rq; -	/* Initialize new task's runnable average */ -	init_entity_runnable_average(&p->se);  	raw_spin_lock_irqsave(&p->pi_lock, rf.flags); +	p->state = TASK_RUNNING;  #ifdef CONFIG_SMP  	/*  	 * Fork balancing, do it here and not earlier because:  	 *  - cpus_allowed can change in the fork path  	 *  - any previously selected cpu might disappear through hotplug +	 * +	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, +	 * as we're not fully set-up yet.  	 */ -	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); +	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif -	/* Post initialize new task's util average when its cfs_rq is set */ +	rq = __task_rq_lock(p, &rf);  	post_init_entity_util_avg(&p->se); -	rq = __task_rq_lock(p, &rf);  	activate_task(rq, p, 0);  	p->on_rq = TASK_ON_RQ_QUEUED;  	trace_sched_wakeup_new(p); @@ -3160,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)  		pr_cont("\n");  	}  #endif +	if (panic_on_warn) +		panic("scheduling while atomic\n"); +  	dump_stack();  	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  } @@ -4751,7 +4762,8 @@ out_unlock:   * @len: length in bytes of the bitmask pointed to by user_mask_ptr   * @user_mask_ptr: user-space pointer to hold the current cpu mask   * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise.   
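The corrected Return: line above is easy to miss from userspace because glibc's sched_getaffinity() wrapper returns 0 on success; the copied size only shows up when the raw syscall is used. A small check of that behaviour, assuming Linux and a kernel cpumask that fits in the 128-byte buffer (true for common configurations):

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long mask[16];		/* 128 bytes */
	long ret;

	memset(mask, 0, sizeof(mask));
	/* Raw syscall: the return value is the number of bytes of the
	 * kernel cpumask copied into 'mask', not 0 as with the glibc wrapper. */
	ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), mask);
	if (ret < 0) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("kernel copied %ld bytes; first word of mask: %#lx\n", ret, mask[0]);
	return 0;
}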
*/  SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  		unsigned long __user *, user_mask_ptr) @@ -5148,14 +5160,16 @@ void show_state_filter(unsigned long state_filter)  		/*  		 * reset the NMI-timeout, listing all files on a slow  		 * console might take a lot of time: +		 * Also, reset softlockup watchdogs on all CPUs, because +		 * another CPU might be blocked waiting for us to process +		 * an IPI.  		 */  		touch_nmi_watchdog(); +		touch_all_softlockup_watchdogs();  		if (!state_filter || (p->state & state_filter))  			sched_show_task(p);  	} -	touch_all_softlockup_watchdogs(); -  #ifdef CONFIG_SCHED_DEBUG  	if (!state_filter)  		sysrq_sched_debug_show(); @@ -5391,13 +5405,15 @@ void idle_task_exit(void)  /*   * Since this CPU is going 'away' for a while, fold any nr_active delta   * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation.   *   * Also see the comment "Global load-average calculations".   */  static void calc_load_migrate(struct rq *rq)  { -	long delta = calc_load_fold_active(rq); +	long delta = calc_load_fold_active(rq, 1);  	if (delta)  		atomic_long_add(delta, &calc_load_tasks);  } @@ -7228,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)  	struct rq *rq = cpu_rq(cpu);  	rq->calc_load_update = calc_load_update; -	account_reset_rq(rq);  	update_max_interval();  } @@ -7708,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)  	INIT_LIST_HEAD(&tg->children);  	list_add_rcu(&tg->siblings, &parent->children);  	spin_unlock_irqrestore(&task_group_lock, flags); + +	online_fair_sched_group(tg);  }  /* rcu callback to free various structures associated with a task group */ @@ -7736,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)  	spin_unlock_irqrestore(&task_group_lock, flags);  } -/* change task's runqueue when it moves between groups. - *	The caller of this function should have put the task in its new group - *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - *	reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type)  {  	struct task_group *tg; -	int queued, running; -	struct rq_flags rf; -	struct rq *rq; - -	rq = task_rq_lock(tsk, &rf); - -	running = task_current(rq, tsk); -	queued = task_on_rq_queued(tsk); - -	if (queued) -		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); -	if (unlikely(running)) -		put_prev_task(rq, tsk);  	/*  	 * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7769,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)  	tsk->sched_task_group = tg;  #ifdef CONFIG_FAIR_GROUP_SCHED -	if (tsk->sched_class->task_move_group) -		tsk->sched_class->task_move_group(tsk); +	if (tsk->sched_class->task_change_group) +		tsk->sched_class->task_change_group(tsk, type);  	else  #endif  		set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. 
+ */ +void sched_move_task(struct task_struct *tsk) +{ +	int queued, running; +	struct rq_flags rf; +	struct rq *rq; + +	rq = task_rq_lock(tsk, &rf); + +	running = task_current(rq, tsk); +	queued = task_on_rq_queued(tsk); + +	if (queued) +		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); +	if (unlikely(running)) +		put_prev_task(rq, tsk); + +	sched_change_group(tsk, TASK_MOVE_GROUP);  	if (unlikely(running))  		tsk->sched_class->set_curr_task(rq); @@ -8201,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)  	sched_free_group(tg);  } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */  static void cpu_cgroup_fork(struct task_struct *task)  { -	sched_move_task(task); +	struct rq_flags rf; +	struct rq *rq; + +	rq = task_rq_lock(task, &rf); + +	sched_change_group(task, TASK_SET_GROUP); + +	task_rq_unlock(rq, task, &rf);  }  static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)  {  	struct task_struct *task;  	struct cgroup_subsys_state *css; +	int ret = 0;  	cgroup_taskset_for_each(task, css, tset) {  #ifdef CONFIG_RT_GROUP_SCHED @@ -8220,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)  		if (task->sched_class != &fair_sched_class)  			return -EINVAL;  #endif +		/* +		 * Serialize against wake_up_new_task() such that if its +		 * running, we're sure to observe its full state. +		 */ +		raw_spin_lock_irq(&task->pi_lock); +		/* +		 * Avoid calling sched_move_task() before wake_up_new_task() +		 * has happened. This would lead to problems with PELT, due to +		 * move wanting to detach+attach while we're not attached yet. +		 */ +		if (task->state == TASK_NEW) +			ret = -EINVAL; +		raw_spin_unlock_irq(&task->pi_lock); + +		if (ret) +			break;  	} -	return 0; +	return ret;  }  static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index {  	CPUACCT_STAT_NSTATS,  }; -enum cpuacct_usage_index { -	CPUACCT_USAGE_USER,	/* ... user mode */ -	CPUACCT_USAGE_SYSTEM,	/* ... kernel mode */ - -	CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { +	[CPUACCT_STAT_USER] = "user", +	[CPUACCT_STAT_SYSTEM] = "system",  };  struct cpuacct_usage { -	u64	usages[CPUACCT_USAGE_NRUSAGE]; +	u64	usages[CPUACCT_STAT_NSTATS];  };  /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)  }  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, -				 enum cpuacct_usage_index index) +				 enum cpuacct_stat_index index)  {  	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);  	u64 data;  	/* -	 * We allow index == CPUACCT_USAGE_NRUSAGE here to read +	 * We allow index == CPUACCT_STAT_NSTATS here to read  	 * the sum of suages.  	 
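The cpuacct rework above keeps the trick of passing the CPUACCT_STAT_NSTATS sentinel as an index meaning "sum of all entries" (the BUG_ON only rejects indexes beyond it). A trivial sketch of that sentinel-index convention, with invented names:

#include <stdio.h>

enum stat_index {
	STAT_USER,
	STAT_SYSTEM,
	STAT_NSTATS,		/* also accepted as "give me the total" */
};

static unsigned long long usages[STAT_NSTATS] = { 700, 300 };

static unsigned long long usage_read(enum stat_index index)
{
	unsigned long long sum = 0;
	int i;

	if (index < STAT_NSTATS)
		return usages[index];

	/* index == STAT_NSTATS: sum every per-class counter. */
	for (i = 0; i < STAT_NSTATS; i++)
		sum += usages[i];
	return sum;
}

int main(void)
{
	printf("user=%llu system=%llu total=%llu\n",
	       usage_read(STAT_USER), usage_read(STAT_SYSTEM),
	       usage_read(STAT_NSTATS));
	return 0;
}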
*/ -	BUG_ON(index > CPUACCT_USAGE_NRUSAGE); +	BUG_ON(index > CPUACCT_STAT_NSTATS);  #ifndef CONFIG_64BIT  	/* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,  	raw_spin_lock_irq(&cpu_rq(cpu)->lock);  #endif -	if (index == CPUACCT_USAGE_NRUSAGE) { +	if (index == CPUACCT_STAT_NSTATS) {  		int i = 0;  		data = 0; -		for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) +		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)  			data += cpuusage->usages[i];  	} else {  		data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)  	raw_spin_lock_irq(&cpu_rq(cpu)->lock);  #endif -	for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) +	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)  		cpuusage->usages[i] = val;  #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)  /* return total cpu usage (in nanoseconds) of a group */  static u64 __cpuusage_read(struct cgroup_subsys_state *css, -			   enum cpuacct_usage_index index) +			   enum cpuacct_stat_index index)  {  	struct cpuacct *ca = css_ca(css);  	u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,  static u64 cpuusage_user_read(struct cgroup_subsys_state *css,  			      struct cftype *cft)  { -	return __cpuusage_read(css, CPUACCT_USAGE_USER); +	return __cpuusage_read(css, CPUACCT_STAT_USER);  }  static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,  			     struct cftype *cft)  { -	return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +	return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);  }  static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)  { -	return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +	return __cpuusage_read(css, CPUACCT_STAT_NSTATS);  }  static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,  }  static int __cpuacct_percpu_seq_show(struct seq_file *m, -				     enum cpuacct_usage_index index) +				     enum cpuacct_stat_index index)  {  	struct cpuacct *ca = css_ca(seq_css(m));  	u64 percpu; @@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,  static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)  { -	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);  }  static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)  { -	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);  }  static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)  { -	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);  } -static const char * const cpuacct_stat_desc[] = { -	[CPUACCT_STAT_USER] = "user", -	[CPUACCT_STAT_SYSTEM] = "system", -}; +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ +	struct cpuacct *ca = css_ca(seq_css(m)); +	int index; +	int cpu; + +	seq_puts(m, "cpu"); +	for (index = 0; index < CPUACCT_STAT_NSTATS; index++) +		seq_printf(m, " %s", cpuacct_stat_desc[index]); +	seq_puts(m, "\n"); + +	for_each_possible_cpu(cpu) { +		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +		seq_printf(m, "%d", cpu); + +		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT +			/* +			 * Take rq->lock to make 64-bit read safe on 
32-bit +			 * platforms. +			 */ +			raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + +			seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT +			raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif +		} +		seq_puts(m, "\n"); +	} +	return 0; +}  static int cpuacct_stats_show(struct seq_file *sf, void *v)  {  	struct cpuacct *ca = css_ca(seq_css(sf)); +	s64 val[CPUACCT_STAT_NSTATS];  	int cpu; -	s64 val = 0; +	int stat; +	memset(val, 0, sizeof(val));  	for_each_possible_cpu(cpu) { -		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -		val += kcpustat->cpustat[CPUTIME_USER]; -		val += kcpustat->cpustat[CPUTIME_NICE]; -	} -	val = cputime64_to_clock_t(val); -	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); +		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; -	val = 0; -	for_each_possible_cpu(cpu) { -		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); -		val += kcpustat->cpustat[CPUTIME_SYSTEM]; -		val += kcpustat->cpustat[CPUTIME_IRQ]; -		val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; +		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER]; +		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE]; +		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; +		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; +		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];  	} -	val = cputime64_to_clock_t(val); -	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); +	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { +		seq_printf(sf, "%s %lld\n", +			   cpuacct_stat_desc[stat], +			   cputime64_to_clock_t(val[stat])); +	}  	return 0;  } @@ -302,6 +330,10 @@ static struct cftype files[] = {  		.seq_show = cpuacct_percpu_sys_seq_show,  	},  	{ +		.name = "usage_all", +		.seq_show = cpuacct_all_seq_show, +	}, +	{  		.name = "stat",  		.seq_show = cpuacct_stats_show,  	}, @@ -316,11 +348,11 @@ static struct cftype files[] = {  void cpuacct_charge(struct task_struct *tsk, u64 cputime)  {  	struct cpuacct *ca; -	int index = CPUACCT_USAGE_SYSTEM; +	int index = CPUACCT_STAT_SYSTEM;  	struct pt_regs *regs = task_pt_regs(tsk);  	if (regs && user_mode(regs)) -		index = CPUACCT_USAGE_USER; +		index = CPUACCT_STAT_USER;  	rcu_read_lock(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa25cc45..a84641b222c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,8 @@ struct sugov_cpu {  	struct update_util_data update_util;  	struct sugov_policy *sg_policy; +	unsigned int cached_raw_freq; +  	/* The fields below are only needed when sharing a policy. */  	unsigned long util;  	unsigned long max; @@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,  /**   * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @policy: cpufreq policy object to compute the new frequency for. + * @sg_cpu: schedutil cpu object to compute the new frequency for.   * @util: Current CPU utilization.   * @max: CPU capacity.   * @@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,   * next_freq = C * curr_freq * util_raw / max   *   * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations.   
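Putting the kernel-doc above into numbers: the raw request is next_freq = 1.25 * base_freq * util / max, which is then resolved to the lowest supported frequency at or above that value, and the raw value is cached (cached_raw_freq) so an unchanged request can return the previous answer without another lookup. The sketch below assumes frequency-invariant utilization (so the maximum frequency is the base) and uses a made-up frequency table in place of cpufreq_driver_resolve_freq():

#include <stdio.h>

/* Hypothetical table of driver-supported frequencies, ascending (kHz). */
static const unsigned int freq_table[] = { 800000, 1200000, 1800000, 2400000 };
#define NFREQS (sizeof(freq_table) / sizeof(freq_table[0]))

static unsigned int cached_raw_freq;
static unsigned int last_resolved_freq;

/* Stand-in for cpufreq_driver_resolve_freq(): lowest supported frequency
 * greater than or equal to the raw request, clamped to the table maximum. */
static unsigned int resolve_freq(unsigned int raw)
{
	unsigned int i;

	for (i = 0; i < NFREQS; i++)
		if (freq_table[i] >= raw)
			return freq_table[i];
	return freq_table[NFREQS - 1];
}

static unsigned int get_next_freq(unsigned int max_freq,
				  unsigned long util, unsigned long max)
{
	/* next_freq = C * max_freq * util / max, with C = 1.25 */
	unsigned int raw = (max_freq + (max_freq >> 2)) * util / max;

	/* last_resolved_freq doubles as a "have we resolved before" flag,
	 * playing the role of the kernel's next_freq != UINT_MAX check. */
	if (raw == cached_raw_freq && last_resolved_freq)
		return last_resolved_freq;

	cached_raw_freq = raw;
	last_resolved_freq = resolve_freq(raw);
	return last_resolved_freq;
}

int main(void)
{
	/* util/max = 0.5 on a 2.4 GHz CPU: 1.5 GHz raw, resolved to 1.8 GHz. */
	printf("next freq: %u kHz\n", get_next_freq(2400000, 512, 1024));
	return 0;
}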
*/ -static unsigned int get_next_freq(struct cpufreq_policy *policy, -				  unsigned long util, unsigned long max) +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, +				  unsigned long max)  { +	struct sugov_policy *sg_policy = sg_cpu->sg_policy; +	struct cpufreq_policy *policy = sg_policy->policy;  	unsigned int freq = arch_scale_freq_invariant() ?  				policy->cpuinfo.max_freq : policy->cur; -	return (freq + (freq >> 2)) * util / max; +	freq = (freq + (freq >> 2)) * util / max; + +	if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) +		return sg_policy->next_freq; +	sg_cpu->cached_raw_freq = freq; +	return cpufreq_driver_resolve_freq(policy, freq);  }  static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,  		return;  	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : -			get_next_freq(policy, util, max); +			get_next_freq(sg_cpu, util, max);  	sugov_update_commit(sg_policy, time, next_f);  } -static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,  					   unsigned long util, unsigned long max)  { +	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	struct cpufreq_policy *policy = sg_policy->policy;  	unsigned int max_f = policy->cpuinfo.max_freq;  	u64 last_freq_update_time = sg_policy->last_freq_update_time; @@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,  		}  	} -	return get_next_freq(policy, util, max); +	return get_next_freq(sg_cpu, util, max);  }  static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,  	sg_cpu->last_update = time;  	if (sugov_should_update_freq(sg_policy, time)) { -		next_f = sugov_next_freq_shared(sg_policy, util, max); +		next_f = sugov_next_freq_shared(sg_cpu, util, max);  		sugov_update_commit(sg_policy, time, next_f);  	} @@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy)  	return ret;  } -static int sugov_exit(struct cpufreq_policy *policy) +static void sugov_exit(struct cpufreq_policy *policy)  {  	struct sugov_policy *sg_policy = policy->governor_data;  	struct sugov_tunables *tunables = sg_policy->tunables; @@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy)  	mutex_unlock(&global_tunables_lock);  	sugov_policy_free(sg_policy); -	return 0;  }  static int sugov_start(struct cpufreq_policy *policy) @@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)  			sg_cpu->util = ULONG_MAX;  			sg_cpu->max = 0;  			sg_cpu->last_update = 0; +			sg_cpu->cached_raw_freq = 0;  			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,  						     sugov_update_shared);  		} else { @@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy)  	return 0;  } -static int sugov_stop(struct cpufreq_policy *policy) +static void sugov_stop(struct cpufreq_policy *policy)  {  	struct sugov_policy *sg_policy = policy->governor_data;  	unsigned int cpu; @@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy)  	irq_work_sync(&sg_policy->irq_work);  	cancel_work_sync(&sg_policy->work); -	return 0;  } -static int sugov_limits(struct cpufreq_policy *policy) +static void sugov_limits(struct cpufreq_policy *policy)  {  	struct sugov_policy *sg_policy = policy->governor_data;  	if 
(!policy->fast_switch_enabled) {  		mutex_lock(&sg_policy->work_lock); - -		if (policy->max < policy->cur) -			__cpufreq_driver_target(policy, policy->max, -						CPUFREQ_RELATION_H); -		else if (policy->min > policy->cur) -			__cpufreq_driver_target(policy, policy->min, -						CPUFREQ_RELATION_L); - +		cpufreq_policy_apply_limits(policy);  		mutex_unlock(&sg_policy->work_lock);  	}  	sg_policy->need_freq_update = true; -	return 0; -} - -int sugov_governor(struct cpufreq_policy *policy, unsigned int event) -{ -	if (event == CPUFREQ_GOV_POLICY_INIT) { -		return sugov_init(policy); -	} else if (policy->governor_data) { -		switch (event) { -		case CPUFREQ_GOV_POLICY_EXIT: -			return sugov_exit(policy); -		case CPUFREQ_GOV_START: -			return sugov_start(policy); -		case CPUFREQ_GOV_STOP: -			return sugov_stop(policy); -		case CPUFREQ_GOV_LIMITS: -			return sugov_limits(policy); -		} -	} -	return -EINVAL;  }  static struct cpufreq_governor schedutil_gov = {  	.name = "schedutil", -	.governor = sugov_governor,  	.owner = THIS_MODULE, +	.init = sugov_init, +	.exit = sugov_exit, +	.start = sugov_start, +	.stop = sugov_stop, +	.limits = sugov_limits,  };  static int __init sugov_module_init(void) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..1934f658c036 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);   */  void irqtime_account_irq(struct task_struct *curr)  { -	unsigned long flags;  	s64 delta;  	int cpu;  	if (!sched_clock_irqtime)  		return; -	local_irq_save(flags); -  	cpu = smp_processor_id();  	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);  	__this_cpu_add(irq_start_time, delta); @@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)  		__this_cpu_add(cpu_softirq_time, delta);  	irq_time_write_end(); -	local_irq_restore(flags);  }  EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime)  {  	u64 *cpustat = kcpustat_this_cpu->cpustat;  	unsigned long flags; -	u64 latest_ns; -	int ret = 0; +	cputime_t irq_cputime;  	local_irq_save(flags); -	latest_ns = this_cpu_read(cpu_hardirq_time); -	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) -		ret = 1; +	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - +		      cpustat[CPUTIME_IRQ]; +	irq_cputime = min(irq_cputime, maxtime); +	cpustat[CPUTIME_IRQ] += irq_cputime;  	local_irq_restore(flags); -	return ret; +	return irq_cputime;  } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime)  {  	u64 *cpustat = kcpustat_this_cpu->cpustat;  	unsigned long flags; -	u64 latest_ns; -	int ret = 0; +	cputime_t softirq_cputime;  	local_irq_save(flags); -	latest_ns = this_cpu_read(cpu_softirq_time); -	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) -		ret = 1; +	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - +			  cpustat[CPUTIME_SOFTIRQ]; +	softirq_cputime = min(softirq_cputime, maxtime); +	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;  	local_irq_restore(flags); -	return ret; +	return softirq_cputime;  }  #else /* CONFIG_IRQ_TIME_ACCOUNTING */  #define sched_clock_irqtime	(0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ +	return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ +	return 0; +} +  #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */  static inline void 
task_group_account_field(struct task_struct *p, int index, @@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime)  		cpustat[CPUTIME_IDLE] += (__force u64) cputime;  } -static __always_inline bool steal_account_process_tick(void) +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)  {  #ifdef CONFIG_PARAVIRT  	if (static_key_false(¶virt_steal_enabled)) { +		cputime_t steal_cputime;  		u64 steal; -		unsigned long steal_jiffies;  		steal = paravirt_steal_clock(smp_processor_id());  		steal -= this_rq()->prev_steal_time; -		/* -		 * steal is in nsecs but our caller is expecting steal -		 * time in jiffies. Lets cast the result to jiffies -		 * granularity and account the rest on the next rounds. -		 */ -		steal_jiffies = nsecs_to_jiffies(steal); -		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); +		steal_cputime = min(nsecs_to_cputime(steal), maxtime); +		account_steal_time(steal_cputime); +		this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); -		account_steal_time(jiffies_to_cputime(steal_jiffies)); -		return steal_jiffies; +		return steal_cputime;  	}  #endif -	return false; +	return 0; +} + +/* + * Account how much elapsed time was spent in steal, irq, or softirq time. + */ +static inline cputime_t account_other_time(cputime_t max) +{ +	cputime_t accounted; + +	accounted = steal_account_process_time(max); + +	if (accounted < max) +		accounted += irqtime_account_hi_update(max - accounted); + +	if (accounted < max) +		accounted += irqtime_account_si_update(max - accounted); + +	return accounted;  }  /* @@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)  static void irqtime_account_process_tick(struct task_struct *p, int user_tick,  					 struct rq *rq, int ticks)  { -	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); -	u64 cputime = (__force u64) cputime_one_jiffy; -	u64 *cpustat = kcpustat_this_cpu->cpustat; +	u64 cputime = (__force u64) cputime_one_jiffy * ticks; +	cputime_t scaled, other; -	if (steal_account_process_tick()) +	/* +	 * When returning from idle, many ticks can get accounted at +	 * once, including some ticks of steal, irq, and softirq time. +	 * Subtract those ticks from the amount of time accounted to +	 * idle, or potentially user or system time. Due to rounding, +	 * other time can exceed ticks occasionally. +	 */ +	other = account_other_time(cputime); +	if (other >= cputime)  		return; +	cputime -= other; +	scaled = cputime_to_scaled(cputime); -	cputime *= ticks; -	scaled *= ticks; - -	if (irqtime_account_hi_update()) { -		cpustat[CPUTIME_IRQ] += cputime; -	} else if (irqtime_account_si_update()) { -		cpustat[CPUTIME_SOFTIRQ] += cputime; -	} else if (this_cpu_ksoftirqd() == p) { +	if (this_cpu_ksoftirqd() == p) {  		/*  		 * ksoftirqd time do not get accounted in cpu_softirq_time.  		 * So, we have to handle it separately here. @@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev)  }  #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  /*   * Archs that account the whole time spent in the idle task   * (outside irq) as idle time can rely on this and just implement @@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev)   * vtime_account().   
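The new account_other_time() above is a budgeting helper: steal time is charged first, and hard-irq and soft-irq time only receive what is left of the passed-in maximum, so together they can never exceed the elapsed time being accounted. A toy model of that capping order, with made-up pending amounts:

#include <stdio.h>

/* Hypothetical pending amounts, in the same unit as the budget. */
static unsigned long pending_steal = 7;
static unsigned long pending_irq = 4;
static unsigned long pending_softirq = 2;

static unsigned long take(unsigned long *pending, unsigned long budget)
{
	unsigned long got = *pending < budget ? *pending : budget;

	*pending -= got;
	return got;
}

/* Mirror of the account_other_time() idea: each class may only consume
 * what is left of the budget after the classes before it. */
static unsigned long account_other_time(unsigned long max)
{
	unsigned long accounted;

	accounted = take(&pending_steal, max);
	if (accounted < max)
		accounted += take(&pending_irq, max - accounted);
	if (accounted < max)
		accounted += take(&pending_softirq, max - accounted);
	return accounted;
}

int main(void)
{
	unsigned long tick = 10;
	unsigned long other = account_other_time(tick);

	/* 7 steal plus 3 of the 4 pending irq fit; softirq waits for later ticks. */
	printf("other=%lu, left for user/system/idle=%lu\n", other, tick - other);
	return 0;
}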
*/  #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk)  { -	if (!in_interrupt()) { -		/* -		 * If we interrupted user, context_tracking_in_user() -		 * is 1 because the context tracking don't hook -		 * on irq entry/exit. This way we know if -		 * we need to flush user time on kernel entry. -		 */ -		if (context_tracking_in_user()) { -			vtime_account_user(tsk); -			return; -		} - -		if (is_idle_task(tsk)) { -			vtime_account_idle(tsk); -			return; -		} -	} -	vtime_account_system(tsk); +	if (!in_interrupt() && is_idle_task(tsk)) +		vtime_account_idle(tsk); +	else +		vtime_account_system(tsk);  } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter);  #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE  void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)  {  	*ut = p->utime; @@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime   */  void account_process_tick(struct task_struct *p, int user_tick)  { -	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	cputime_t cputime, scaled, steal;  	struct rq *rq = this_rq();  	if (vtime_accounting_cpu_enabled()) @@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick)  		return;  	} -	if (steal_account_process_tick()) +	cputime = cputime_one_jiffy; +	steal = steal_account_process_time(cputime); + +	if (steal >= cputime)  		return; +	cputime -= steal; +	scaled = cputime_to_scaled(cputime); +  	if (user_tick) -		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +		account_user_time(p, cputime, scaled);  	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) -		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, -				    one_jiffy_scaled); +		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);  	else -		account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. 
- * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ -	account_steal_time(jiffies_to_cputime(ticks)); +		account_idle_time(cputime);  }  /* @@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)  static cputime_t get_vtime_delta(struct task_struct *tsk)  {  	unsigned long now = READ_ONCE(jiffies); -	unsigned long delta = now - tsk->vtime_snap; +	cputime_t delta, other; +	delta = jiffies_to_cputime(now - tsk->vtime_snap); +	other = account_other_time(delta);  	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);  	tsk->vtime_snap = now; -	return jiffies_to_cputime(delta); +	return delta - other;  }  static void __vtime_account_system(struct task_struct *tsk) @@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk)  	write_seqcount_end(&tsk->vtime_seqcount);  } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ -	write_seqcount_begin(&tsk->vtime_seqcount); -	if (vtime_delta(tsk)) -		__vtime_account_system(tsk); -	if (context_tracking_in_user()) -		tsk->vtime_snap_whence = VTIME_USER; -	write_seqcount_end(&tsk->vtime_seqcount); -} -  void vtime_account_user(struct task_struct *tsk)  {  	cputime_t delta_cpu; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS  	P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS  	if (schedstat_enabled()) {  		u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)  	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */  } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +  /*   * With new tasks being created, their initial util_avgs are extrapolated   * based on the cfs_rq's current util_avg: @@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	struct sched_avg *sa = &se->avg;  	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; +	u64 now = cfs_rq_clock_task(cfs_rq); +	int tg_update;  	if (cap > 0) {  		if (cfs_rq->avg.util_avg != 0) { @@ -733,18 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)  		}  		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;  	} + +	if (entity_is_task(se)) { +		struct task_struct *p = task_of(se); +		if (p->sched_class != &fair_sched_class) { +			/* +			 * For !fair tasks do: +			 * +			update_cfs_rq_load_avg(now, cfs_rq, false); +			attach_entity_load_avg(cfs_rq, se); +			switched_from_fair(rq, p); +			 * +			 * such that the next switched_to_fair() has the +			 * expected state. 
+			 */ +			se->avg.last_update_time = now; +			return; +		} +	} + +	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); +	attach_entity_load_avg(cfs_rq, se); +	if (tg_update) +		update_tg_load_avg(cfs_rq, false);  } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); -#else +#else /* !CONFIG_SMP */  void init_entity_runnable_average(struct sched_entity *se)  {  }  void post_init_entity_util_avg(struct sched_entity *se)  {  } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */  /*   * Update the current task's runtime statistics. @@ -1305,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,  {  	if (env->best_task)  		put_task_struct(env->best_task); +	if (p) +		get_task_struct(p);  	env->best_task = p;  	env->best_imp = imp; @@ -1372,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,  	long imp = env->p->numa_group ? groupimp : taskimp;  	long moveimp = imp;  	int dist = env->dist; -	bool assigned = false;  	rcu_read_lock(); - -	raw_spin_lock_irq(&dst_rq->lock); -	cur = dst_rq->curr; -	/* -	 * No need to move the exiting task or idle task. -	 */ -	if ((cur->flags & PF_EXITING) || is_idle_task(cur)) +	cur = task_rcu_dereference(&dst_rq->curr); +	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))  		cur = NULL; -	else { -		/* -		 * The task_struct must be protected here to protect the -		 * p->numa_faults access in the task_weight since the -		 * numa_faults could already be freed in the following path: -		 * finish_task_switch() -		 *     --> put_task_struct() -		 *         --> __put_task_struct() -		 *             --> task_numa_free() -		 */ -		get_task_struct(cur); -	} - -	raw_spin_unlock_irq(&dst_rq->lock);  	/*  	 * Because we have preemption enabled we can get migrated around and @@ -1479,7 +1492,6 @@ balance:  		 */  		if (!load_too_imbalanced(src_load, dst_load, env)) {  			imp = moveimp - 1; -			put_task_struct(cur);  			cur = NULL;  			goto assign;  		} @@ -1505,16 +1517,9 @@ balance:  		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);  assign: -	assigned = true;  	task_numa_assign(env, cur, imp);  unlock:  	rcu_read_unlock(); -	/* -	 * The dst_rq->curr isn't assigned. The protection for task_struct is -	 * finished. -	 */ -	if (cur && !assigned) -		put_task_struct(cur);  }  static void task_numa_find_cpu(struct task_numa_env *env, @@ -2499,28 +2504,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  #ifdef CONFIG_FAIR_GROUP_SCHED  # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  { -	long tg_weight; +	long tg_weight, load, shares;  	/* -	 * Use this CPU's real-time load instead of the last load contribution -	 * as the updating of the contribution is delayed, and we will use the -	 * the real-time load to calc the share. See update_tg_load_avg(). +	 * This really should be: cfs_rq->avg.load_avg, but instead we use +	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up +	 * the shares for small weight interactive tasks.  	 
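The calc_cfs_shares() rework in this hunk boils down to shares = tg->shares * load / (group load_avg with this cfs_rq's stale contribution swapped for its current weight), using load.weight as an upper bound so lightly loaded interactive groups ramp up quickly. A plain arithmetic sketch of that formula with made-up numbers (scale_load_down() and the final clamping done by the real function are omitted):

#include <stdio.h>

static long calc_cfs_shares(long tg_shares, long tg_load_avg,
			    long my_load_contrib, long my_load_weight)
{
	long tg_weight, shares;

	/* Replace our stale contribution with the current (upper-bound) load. */
	tg_weight = tg_load_avg - my_load_contrib + my_load_weight;

	shares = tg_shares * my_load_weight;
	if (tg_weight)
		shares /= tg_weight;

	return shares;
}

int main(void)
{
	/* Group configured with 1024 shares, 3072 total load, this runqueue
	 * holding 1024 of it: it gets roughly a third of the shares. */
	printf("shares = %ld\n", calc_cfs_shares(1024, 3072, 1024, 1024));
	return 0;
}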
*/ -	tg_weight = atomic_long_read(&tg->load_avg); -	tg_weight -= cfs_rq->tg_load_avg_contrib; -	tg_weight += cfs_rq->load.weight; +	load = scale_load_down(cfs_rq->load.weight); -	return tg_weight; -} - -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ -	long tg_weight, load, shares; +	tg_weight = atomic_long_read(&tg->load_avg); -	tg_weight = calc_tg_weight(tg, cfs_rq); -	load = cfs_rq->load.weight; +	/* Ensure tg_weight >= load */ +	tg_weight -= cfs_rq->tg_load_avg_contrib; +	tg_weight += load;  	shares = (tg->shares * load);  	if (tg_weight) @@ -2539,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  	return tg->shares;  }  # endif /* CONFIG_SMP */ +  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  			    unsigned long weight)  { @@ -2873,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}  #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  {  	struct rq *rq = rq_of(cfs_rq); @@ -2904,7 +2902,40 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  	}  } -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do {				\ +	typeof(_ptr) ptr = (_ptr);				\ +	typeof(*ptr) val = (_val);				\ +	typeof(*ptr) res, var = READ_ONCE(*ptr);		\ +	res = var - val;					\ +	if (res > var)						\ +		res = 0;					\ +	WRITE_ONCE(*ptr, res);					\ +} while (0) + +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. 
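The sub_positive() macro introduced above clamps an unsigned subtraction at zero; the "res > var" test works because an unsigned result can only come out larger than the starting value when the subtraction wrapped. A userspace demonstration of just that clamp, dropping the READ_ONCE()/WRITE_ONCE() pair that keeps lockless readers from seeing an intermediate value:

#include <stdio.h>

/* GNU C: relies on typeof, as the kernel macro does. */
#define sub_positive(_ptr, _val) do {			\
	typeof(*(_ptr)) var = *(_ptr);			\
	typeof(*(_ptr)) res = var - (_val);		\
	if (res > var)	/* wrapped below zero */	\
		res = 0;				\
	*(_ptr) = res;					\
} while (0)

int main(void)
{
	unsigned long avg = 100;

	sub_positive(&avg, 30);		/* 100 - 30 = 70 */
	printf("after -30: %lu\n", avg);
	sub_positive(&avg, 1000);	/* would underflow, clamps to 0 */
	printf("after -1000: %lu\n", avg);
	return 0;
}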
+ */  static inline int  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  { @@ -2913,15 +2944,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  	if (atomic_long_read(&cfs_rq->removed_load_avg)) {  		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); -		sa->load_avg = max_t(long, sa->load_avg - r, 0); -		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); +		sub_positive(&sa->load_avg, r); +		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);  		removed_load = 1;  	}  	if (atomic_long_read(&cfs_rq->removed_util_avg)) {  		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); -		sa->util_avg = max_t(long, sa->util_avg - r, 0); -		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); +		sub_positive(&sa->util_avg, r); +		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);  		removed_util = 1;  	} @@ -2959,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)  		update_tg_load_avg(cfs_rq, 0);  } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2967,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  	/*  	 * If we got migrated (either between CPUs or between cgroups) we'll  	 * have aged the average right before clearing @last_update_time. +	 * +	 * Or we're fresh through post_init_entity_util_avg().  	 */  	if (se->avg.last_update_time) {  		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -2988,16 +3029,24 @@ skip_aging:  	cfs_rq_util_change(cfs_rq);  } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),  			  &se->avg, se->on_rq * scale_load_down(se->load.weight),  			  cfs_rq->curr == se, NULL); -	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); -	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0); -	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); -	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0); +	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); +	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); +	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); +	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);  	cfs_rq_util_change(cfs_rq);  } @@ -3072,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)  	u64 last_update_time;  	/* -	 * Newly created task or never used group entity should not be removed -	 * from its (source) cfs_rq +	 * tasks cannot exit without having gone through wake_up_new_task() -> +	 * post_init_entity_util_avg() which will have added things to the +	 * cfs_rq, so we can remove unconditionally. 
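update_cfs_rq_load_avg() above drains the pending removed_load_avg/removed_util_avg accumulators with an atomic exchange, so remote removers can keep posting while the local update claims everything posted so far exactly once. A C11 sketch of that drain-with-xchg pattern, using invented names and a plain clamp where the kernel uses sub_positive():

#include <stdatomic.h>
#include <stdio.h>

/* Remote CPUs add the load of departing entities here... */
static atomic_long removed_load = 0;
/* ...and this is the local average it gets folded into. */
static long load_avg = 1000;

static void remove_entity_load(long load)
{
	atomic_fetch_add(&removed_load, load);
}

static void update_load_avg(void)
{
	/* Claim everything posted so far; later additions stay pending. */
	long r = atomic_exchange(&removed_load, 0);

	if (r) {
		load_avg -= r;
		if (load_avg < 0)	/* clamp, as sub_positive() does */
			load_avg = 0;
	}
}

int main(void)
{
	remove_entity_load(300);
	remove_entity_load(200);
	update_load_avg();
	printf("load_avg after draining removals: %ld\n", load_avg);	/* 500 */
	return 0;
}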
+	 * +	 * Similarly for groups, they will have passed through +	 * post_init_entity_util_avg() before unregister_sched_fair_group() +	 * calls this.  	 */ -	if (se->avg.last_update_time == 0) -		return;  	last_update_time = cfs_rq_last_update_time(cfs_rq); @@ -3099,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);  #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ +	return 0; +} +  static inline void update_load_avg(struct sched_entity *se, int not_used)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -3246,7 +3304,7 @@ static inline void check_schedstat_required(void)  			trace_sched_stat_iowait_enabled()  ||  			trace_sched_stat_blocked_enabled() ||  			trace_sched_stat_runtime_enabled())  { -		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " +		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "  			     "stat_blocked and stat_runtime require the "  			     "kernel parameter schedstats=enabled or "  			     "kernel.sched_schedstats=1\n"); @@ -3688,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  {  	if (unlikely(cfs_rq->throttle_count)) -		return cfs_rq->throttled_clock_task; +		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;  } @@ -3826,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];  	cfs_rq->throttle_count--; -#ifdef CONFIG_SMP  	if (!cfs_rq->throttle_count) {  		/* adjust cfs_rq_clock_task() */  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -  					     cfs_rq->throttled_clock_task;  	} -#endif  	return 0;  } @@ -4199,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)  		throttle_cfs_rq(cfs_rq);  } +static void sync_throttle(struct task_group *tg, int cpu) +{ +	struct cfs_rq *pcfs_rq, *cfs_rq; + +	if (!cfs_bandwidth_used()) +		return; + +	if (!tg->parent) +		return; + +	cfs_rq = tg->cfs_rq[cpu]; +	pcfs_rq = tg->parent->cfs_rq[cpu]; + +	cfs_rq->throttle_count = pcfs_rq->throttle_count; +	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} +  /* conditionally throttle active cfs_rq's from put_prev_entity() */  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)  { @@ -4338,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {}  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -4446,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		 *  		 * note: in the case of encountering a throttled cfs_rq we will  		 * post the final h_nr_running increment below. 
-		*/ +		 */  		if (cfs_rq_throttled(cfs_rq))  			break;  		cfs_rq->h_nr_running++; @@ -4500,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		/* Don't dequeue parent if it has other entities besides us */  		if (cfs_rq->load.weight) { +			/* Avoid re-evaluating load for this entity: */ +			se = parent_entity(se);  			/*  			 * Bias pick_next to pick a task from this cfs_rq, as  			 * p is sleeping when it is within its sched_slice.  			 */ -			if (task_sleep && parent_entity(se)) -				set_next_buddy(parent_entity(se)); - -			/* avoid re-evaluating load for this entity */ -			se = parent_entity(se); +			if (task_sleep && se && !throttled_hierarchy(cfs_rq)) +				set_next_buddy(se);  			break;  		}  		flags |= DEQUEUE_SLEEP; @@ -4910,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  		return wl;  	for_each_sched_entity(se) { -		long w, W; +		struct cfs_rq *cfs_rq = se->my_q; +		long W, w = cfs_rq_load_avg(cfs_rq); -		tg = se->my_q->tg; +		tg = cfs_rq->tg;  		/*  		 * W = @wg + \Sum rw_j  		 */ -		W = wg + calc_tg_weight(tg, se->my_q); +		W = wg + atomic_long_read(&tg->load_avg); + +		/* Ensure \Sum rw_j >= rw_i */ +		W -= cfs_rq->tg_load_avg_contrib; +		W += w;  		/*  		 * w = rw_i + @wl  		 */ -		w = cfs_rq_load_avg(se->my_q) + wl; +		w += wl;  		/*  		 * wl = S * s'_i; see (2) @@ -8283,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)  {  	struct cfs_rq *cfs_rq;  	struct sched_entity *se = &p->se, *curr; -	int this_cpu = smp_processor_id();  	struct rq *rq = this_rq(); -	unsigned long flags; - -	raw_spin_lock_irqsave(&rq->lock, flags); +	raw_spin_lock(&rq->lock);  	update_rq_clock(rq);  	cfs_rq = task_cfs_rq(current);  	curr = cfs_rq->curr; - -	/* -	 * Not only the cpu but also the task_group of the parent might have -	 * been changed after parent->se.parent,cfs_rq were copied to -	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those -	 * of child point to valid ones. 
-	 */ -	rcu_read_lock(); -	__set_task_cpu(p, this_cpu); -	rcu_read_unlock(); - -	update_curr(cfs_rq); - -	if (curr) +	if (curr) { +		update_curr(cfs_rq);  		se->vruntime = curr->vruntime; +	}  	place_entity(cfs_rq, se, 1);  	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8320,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)  	}  	se->vruntime -= cfs_rq->min_vruntime; - -	raw_spin_unlock_irqrestore(&rq->lock, flags); +	raw_spin_unlock(&rq->lock);  }  /* @@ -8377,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)  {  	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	u64 now = cfs_rq_clock_task(cfs_rq); +	int tg_update;  	if (!vruntime_normalized(p)) {  		/* @@ -8388,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)  	}  	/* Catch up with the cfs_rq and remove our load when we leave */ +	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);  	detach_entity_load_avg(cfs_rq, se); +	if (tg_update) +		update_tg_load_avg(cfs_rq, false);  }  static void attach_task_cfs_rq(struct task_struct *p)  {  	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	u64 now = cfs_rq_clock_task(cfs_rq); +	int tg_update;  #ifdef CONFIG_FAIR_GROUP_SCHED  	/* @@ -8405,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)  #endif  	/* Synchronize task with its cfs_rq */ +	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);  	attach_entity_load_avg(cfs_rq, se); +	if (tg_update) +		update_tg_load_avg(cfs_rq, false);  	if (!vruntime_normalized(p))  		se->vruntime += cfs_rq->min_vruntime; @@ -8465,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)  }  #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ +	struct sched_entity *se = &p->se; + +	set_task_rq(p, task_cpu(p)); +	se->depth = se->parent ? 
se->parent->depth + 1 : 0; +} +  static void task_move_group_fair(struct task_struct *p)  {  	detach_task_cfs_rq(p); @@ -8477,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)  	attach_task_cfs_rq(p);  } +static void task_change_group_fair(struct task_struct *p, int type) +{ +	switch (type) { +	case TASK_SET_GROUP: +		task_set_group_fair(p); +		break; + +	case TASK_MOVE_GROUP: +		task_move_group_fair(p); +		break; +	} +} +  void free_fair_sched_group(struct task_group *tg)  {  	int i; @@ -8496,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg)  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  { -	struct cfs_rq *cfs_rq;  	struct sched_entity *se; +	struct cfs_rq *cfs_rq; +	struct rq *rq;  	int i;  	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));  	for_each_possible_cpu(i) { +		rq = cpu_rq(i); +  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),  				      GFP_KERNEL, cpu_to_node(i));  		if (!cfs_rq) @@ -8525,7 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  		init_cfs_rq(cfs_rq);  		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);  		init_entity_runnable_average(se); -		post_init_entity_util_avg(se);  	}  	return 1; @@ -8536,6 +8632,23 @@ err:  	return 0;  } +void online_fair_sched_group(struct task_group *tg) +{ +	struct sched_entity *se; +	struct rq *rq; +	int i; + +	for_each_possible_cpu(i) { +		rq = cpu_rq(i); +		se = tg->se[i]; + +		raw_spin_lock_irq(&rq->lock); +		post_init_entity_util_avg(se); +		sync_throttle(tg, i); +		raw_spin_unlock_irq(&rq->lock); +	} +} +  void unregister_fair_sched_group(struct task_group *tg)  {  	unsigned long flags; @@ -8640,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	return 1;  } +void online_fair_sched_group(struct task_group *tg) { } +  void unregister_fair_sched_group(struct task_group *tg) { }  #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8699,7 +8814,7 @@ const struct sched_class fair_sched_class = {  	.update_curr		= update_curr_fair,  #ifdef CONFIG_FAIR_GROUP_SCHED -	.task_move_group	= task_move_group_fair, +	.task_change_group	= task_change_group_fair,  #endif  }; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c5aeedf4e93a..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle:   */  static void cpu_idle_loop(void)  { +	int cpu = smp_processor_id(); +  	while (1) {  		/*  		 * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void)  			check_pgt_cache();  			rmb(); -			if (cpu_is_offline(smp_processor_id())) { +			if (cpu_is_offline(cpu)) {  				cpuhp_report_idle_dead();  				arch_cpu_idle_dead();  			} diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)  	loads[2] = (avenrun[2] + offset) << shift;  } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust)  {  	long nr_active, delta = 0; -	nr_active = this_rq->nr_running; +	nr_active = this_rq->nr_running - adjust;  	nr_active += (long)this_rq->nr_uninterruptible;  	if (nr_active != this_rq->calc_load_active) { @@ -188,7 +188,7 @@ void 
calc_load_enter_idle(void)  	 * We're going into NOHZ mode, if there's any pending delta, fold it  	 * into the pending idle delta.  	 */ -	delta = calc_load_fold_active(this_rq); +	delta = calc_load_fold_active(this_rq, 0);  	if (delta) {  		int idx = calc_load_write_idx(); @@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)  	if (time_before(jiffies, this_rq->calc_load_update))  		return; -	delta  = calc_load_fold_active(this_rq); +	delta  = calc_load_fold_active(this_rq, 0);  	if (delta)  		atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,7 +28,7 @@ extern unsigned long calc_load_update;  extern atomic_long_t calc_load_tasks;  extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust);  #ifdef CONFIG_SMP  extern void cpu_load_update_active(struct rq *this_rq); @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);  extern void free_fair_sched_group(struct task_group *tg);  extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg);  extern void unregister_fair_sched_group(struct task_group *tg);  extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  			struct sched_entity *se, int cpu, @@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  	 * In particular, the load of prev->state in finish_task_switch() must  	 * happen before this.  	 * -	 * Pairs with the smp_cond_acquire() in try_to_wake_up(). +	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().  	 */  	smp_store_release(&prev->on_cpu, 0);  #endif @@ -1246,8 +1247,11 @@ struct sched_class {  	void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP  0 +#define TASK_MOVE_GROUP	1 +  #ifdef CONFIG_FAIR_GROUP_SCHED -	void (*task_move_group) (struct task_struct *p); +	void (*task_change_group) (struct task_struct *p, int type);  #endif  }; @@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}  #else /* arch_scale_freq_capacity */  #define arch_scale_freq_invariant()	(false)  #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -	rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT -	rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -	rq->prev_steal_time_rq = 0; -#endif -} diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7002796f14a4..54d15eb2b701 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)   *   * Returns valid seccomp BPF response codes.   
*/ -static u32 seccomp_run_filters(struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd)  {  	struct seccomp_data sd_local;  	u32 ret = SECCOMP_RET_ALLOW; @@ -554,20 +554,10 @@ void secure_computing_strict(int this_syscall)  		BUG();  }  #else -int __secure_computing(void) -{ -	u32 phase1_result = seccomp_phase1(NULL); - -	if (likely(phase1_result == SECCOMP_PHASE1_OK)) -		return 0; -	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) -		return -1; -	else -		return seccomp_phase2(phase1_result); -}  #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, +			    const bool recheck_after_trace)  {  	u32 filter_ret, action;  	int data; @@ -599,10 +589,46 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)  		goto skip;  	case SECCOMP_RET_TRACE: -		return filter_ret;  /* Save the rest for phase 2. */ +		/* We've been put in this state by the ptracer already. */ +		if (recheck_after_trace) +			return 0; + +		/* ENOSYS these calls if there is no tracer attached. */ +		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { +			syscall_set_return_value(current, +						 task_pt_regs(current), +						 -ENOSYS, 0); +			goto skip; +		} + +		/* Allow the BPF to provide the event message */ +		ptrace_event(PTRACE_EVENT_SECCOMP, data); +		/* +		 * The delivery of a fatal signal during event +		 * notification may silently skip tracer notification. +		 * Terminating the task now avoids executing a system +		 * call that may not be intended. +		 */ +		if (fatal_signal_pending(current)) +			do_exit(SIGSYS); +		/* Check if the tracer forced the syscall to be skipped. */ +		this_syscall = syscall_get_nr(current, task_pt_regs(current)); +		if (this_syscall < 0) +			goto skip; + +		/* +		 * Recheck the syscall, since it may have changed. This +		 * intentionally uses a NULL struct seccomp_data to force +		 * a reload of all registers. This does not goto skip since +		 * a skip would have already been reported. +		 */ +		if (__seccomp_filter(this_syscall, NULL, true)) +			return -1; + +		return 0;  	case SECCOMP_RET_ALLOW: -		return SECCOMP_PHASE1_OK; +		return 0;  	case SECCOMP_RET_KILL:  	default: @@ -614,96 +640,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)  skip:  	audit_seccomp(this_syscall, 0, action); -	return SECCOMP_PHASE1_SKIP; +	return -1; +} +#else +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, +			    const bool recheck_after_trace) +{ +	BUG();  }  #endif -/** - * seccomp_phase1() - run fast path seccomp checks on the current syscall - * @arg sd: The seccomp_data or NULL - * - * This only reads pt_regs via the syscall_xyz helpers.  The only change - * it will make to pt_regs is via syscall_set_return_value, and it will - * only do that if it returns SECCOMP_PHASE1_SKIP. - * - * If sd is provided, it will not read pt_regs at all. - * - * It may also call do_exit or force a signal; these actions must be - * safe. - * - * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should - * be processed normally. - * - * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be - * invoked.  In this case, seccomp_phase1 will have set the return value - * using syscall_set_return_value. 
- * - * If it returns anything else, then the return value should be passed - * to seccomp_phase2 from a context in which ptrace hooks are safe. - */ -u32 seccomp_phase1(struct seccomp_data *sd) +int __secure_computing(const struct seccomp_data *sd)  {  	int mode = current->seccomp.mode; -	int this_syscall = sd ? sd->nr : -		syscall_get_nr(current, task_pt_regs(current)); +	int this_syscall;  	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&  	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) -		return SECCOMP_PHASE1_OK; +		return 0; + +	this_syscall = sd ? sd->nr : +		syscall_get_nr(current, task_pt_regs(current));  	switch (mode) {  	case SECCOMP_MODE_STRICT:  		__secure_computing_strict(this_syscall);  /* may call do_exit */ -		return SECCOMP_PHASE1_OK; -#ifdef CONFIG_SECCOMP_FILTER +		return 0;  	case SECCOMP_MODE_FILTER: -		return __seccomp_phase1_filter(this_syscall, sd); -#endif +		return __seccomp_filter(this_syscall, sd, false);  	default:  		BUG();  	}  } - -/** - * seccomp_phase2() - finish slow path seccomp work for the current syscall - * @phase1_result: The return value from seccomp_phase1() - * - * This must be called from a context in which ptrace hooks can be used. - * - * Returns 0 if the syscall should be processed or -1 to skip the syscall. - */ -int seccomp_phase2(u32 phase1_result) -{ -	struct pt_regs *regs = task_pt_regs(current); -	u32 action = phase1_result & SECCOMP_RET_ACTION; -	int data = phase1_result & SECCOMP_RET_DATA; - -	BUG_ON(action != SECCOMP_RET_TRACE); - -	audit_seccomp(syscall_get_nr(current, regs), 0, action); - -	/* Skip these calls if there is no tracer. */ -	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { -		syscall_set_return_value(current, regs, -					 -ENOSYS, 0); -		return -1; -	} - -	/* Allow the BPF to provide the event message */ -	ptrace_event(PTRACE_EVENT_SECCOMP, data); -	/* -	 * The delivery of a fatal signal during event -	 * notification may silently skip tracer notification. -	 * Terminating the task now avoids executing a system -	 * call that may not be intended. -	 */ -	if (fatal_signal_pending(current)) -		do_exit(SIGSYS); -	if (syscall_get_nr(current, regs) < 0) -		return -1;  /* Explicit request to skip. */ - -	return 0; -}  #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */  long prctl_get_seccomp(void) diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)   *  @ts: upper bound on process time suspension   */  int do_sigtimedwait(const sigset_t *which, siginfo_t *info, -			const struct timespec *ts) +		    const struct timespec *ts)  { +	ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };  	struct task_struct *tsk = current; -	long timeout = MAX_SCHEDULE_TIMEOUT;  	sigset_t mask = *which; -	int sig; +	int sig, ret = 0;  	if (ts) {  		if (!timespec_valid(ts))  			return -EINVAL; -		timeout = timespec_to_jiffies(ts); -		/* -		 * We can be close to the next tick, add another one -		 * to ensure we will wait at least the time asked for. 
-		 */ -		if (ts->tv_sec || ts->tv_nsec) -			timeout++; +		timeout = timespec_to_ktime(*ts); +		to = &timeout;  	}  	/* @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  	spin_lock_irq(&tsk->sighand->siglock);  	sig = dequeue_signal(tsk, &mask, info); -	if (!sig && timeout) { +	if (!sig && timeout.tv64) {  		/*  		 * None ready, temporarily unblock those we're interested  		 * while we are sleeping in so that we'll be awakened when @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  		recalc_sigpending();  		spin_unlock_irq(&tsk->sighand->siglock); -		timeout = freezable_schedule_timeout_interruptible(timeout); - +		__set_current_state(TASK_INTERRUPTIBLE); +		ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, +							 HRTIMER_MODE_REL);  		spin_lock_irq(&tsk->sighand->siglock);  		__set_task_blocked(tsk, &tsk->real_blocked);  		sigemptyset(&tsk->real_blocked); @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  	if (sig)  		return sig; -	return timeout ? -EINTR : -EAGAIN; +	return ret ? -EINTR : -EAGAIN;  }  /** diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);  static void flush_smp_call_function_queue(bool warn_cpu_offline); -static int -hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +int smpcfd_prepare_cpu(unsigned int cpu)  { -	long cpu = (long)hcpu;  	struct call_function_data *cfd = &per_cpu(cfd_data, cpu); -	switch (action) { -	case CPU_UP_PREPARE: -	case CPU_UP_PREPARE_FROZEN: -		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, -				cpu_to_node(cpu))) -			return notifier_from_errno(-ENOMEM); -		cfd->csd = alloc_percpu(struct call_single_data); -		if (!cfd->csd) { -			free_cpumask_var(cfd->cpumask); -			return notifier_from_errno(-ENOMEM); -		} -		break; - -#ifdef CONFIG_HOTPLUG_CPU -	case CPU_UP_CANCELED: -	case CPU_UP_CANCELED_FROZEN: -		/* Fall-through to the CPU_DEAD[_FROZEN] case. */ - -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: +	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, +				     cpu_to_node(cpu))) +		return -ENOMEM; +	cfd->csd = alloc_percpu(struct call_single_data); +	if (!cfd->csd) {  		free_cpumask_var(cfd->cpumask); -		free_percpu(cfd->csd); -		break; +		return -ENOMEM; +	} -	case CPU_DYING: -	case CPU_DYING_FROZEN: -		/* -		 * The IPIs for the smp-call-function callbacks queued by other -		 * CPUs might arrive late, either due to hardware latencies or -		 * because this CPU disabled interrupts (inside stop-machine) -		 * before the IPIs were sent. So flush out any pending callbacks -		 * explicitly (without waiting for the IPIs to arrive), to -		 * ensure that the outgoing CPU doesn't go offline with work -		 * still pending. 
-		 */ -		flush_smp_call_function_queue(false); -		break; -#endif -	}; +	return 0; +} + +int smpcfd_dead_cpu(unsigned int cpu) +{ +	struct call_function_data *cfd = &per_cpu(cfd_data, cpu); -	return NOTIFY_OK; +	free_cpumask_var(cfd->cpumask); +	free_percpu(cfd->csd); +	return 0;  } -static struct notifier_block hotplug_cfd_notifier = { -	.notifier_call		= hotplug_cfd, -}; +int smpcfd_dying_cpu(unsigned int cpu) +{ +	/* +	 * The IPIs for the smp-call-function callbacks queued by other +	 * CPUs might arrive late, either due to hardware latencies or +	 * because this CPU disabled interrupts (inside stop-machine) +	 * before the IPIs were sent. So flush out any pending callbacks +	 * explicitly (without waiting for the IPIs to arrive), to +	 * ensure that the outgoing CPU doesn't go offline with work +	 * still pending. +	 */ +	flush_smp_call_function_queue(false); +	return 0; +}  void __init call_function_init(void)  { -	void *cpu = (void *)(long)smp_processor_id();  	int i;  	for_each_possible_cpu(i)  		init_llist_head(&per_cpu(call_single_queue, i)); -	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); -	register_cpu_notifier(&hotplug_cfd_notifier); +	smpcfd_prepare_cpu(smp_processor_id());  }  /* @@ -107,7 +92,7 @@ void __init call_function_init(void)   */  static __always_inline void csd_lock_wait(struct call_single_data *csd)  { -	smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); +	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));  }  static __always_inline void csd_lock(struct call_single_data *csd) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a467e6c28a3b..4a1ca5f6da7e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@  #include <linux/smpboot.h>  #include <linux/atomic.h>  #include <linux/lglock.h> +#include <linux/nmi.h>  /*   * Structure to determine completion condition and record errors.  May @@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data)  				break;  			}  			ack_state(msdata); +		} else if (curstate > MULTI_STOP_PREPARE) { +			/* +			 * At this stage all other CPUs we depend on must spin +			 * in the same loop. Any reason for hard-lockup should +			 * be detected and reported on their side. +			 */ +			touch_nmi_watchdog();  		}  	} while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = {  		.extra2		= &one,  	},  #endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +	{ +		.procname	= "panic_on_rcu_stall", +		.data		= &sysctl_panic_on_rcu_stall, +		.maxlen		= sizeof(sysctl_panic_on_rcu_stall), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +#endif  	{ }  }; @@ -1497,8 +1508,8 @@ static struct ctl_table vm_table[] = {  #ifdef CONFIG_NUMA  	{  		.procname	= "zone_reclaim_mode", -		.data		= &zone_reclaim_mode, -		.maxlen		= sizeof(zone_reclaim_mode), +		.data		= &node_reclaim_mode, +		.maxlen		= sizeof(node_reclaim_mode),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  		.extra1		= &zero, diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -108,7 +108,6 @@ void task_work_run(void)  		 * fail, but it can play with *work and other entries.  		 
*/  		raw_spin_unlock_wait(&task->pi_lock); -		smp_mb();  		do {  			next = work->next; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -30,7 +30,6 @@   * struct alarm_base - Alarm timer bases   * @lock:		Lock for syncrhonized access to the base   * @timerqueue:		Timerqueue head managing the list of events - * @timer: 		hrtimer used to schedule events while running   * @gettime:		Function to read the time correlating to the base   * @base_clockid:	clockid for the base   */ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)  #endif  #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = {  	.name		= "clockevents",  	.dev_name       = "clockevent",  }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs)  	struct list_head *entry = &clocksource_list;  	struct clocksource *tmp; -	list_for_each_entry(tmp, &clocksource_list, list) +	list_for_each_entry(tmp, &clocksource_list, list) {  		/* Keep track of the place, where to insert */ -		if (tmp->rating >= cs->rating) -			entry = &tmp->list; +		if (tmp->rating < cs->rating) +			break; +		entry = &tmp->list; +	}  	list_add(&cs->list, entry);  } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)  #endif  } -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON  static inline  struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,  					 int pinned) @@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,  /*   * Functions related to boot-time initialization:   */ -static void init_hrtimers_cpu(int cpu) +int hrtimers_prepare_cpu(unsigned int cpu)  {  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; @@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)  	cpu_base->cpu = cpu;  	hrtimer_init_hres(cpu_base); +	return 0;  }  #ifdef CONFIG_HOTPLUG_CPU @@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,  	}  } -static void migrate_hrtimers(int scpu) +int hrtimers_dead_cpu(unsigned int scpu)  {  	struct hrtimer_cpu_base *old_base, *new_base;  	int i; @@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)  	/* Check, if we got expired work to do */  	__hrtimer_peek_ahead_timers();  	local_irq_enable(); +	return 0;  }  #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, -					unsigned long action, void *hcpu) -{ -	int scpu = (long)hcpu; - -	switch (action) { - -	case CPU_UP_PREPARE: -	case CPU_UP_PREPARE_FROZEN: -		init_hrtimers_cpu(scpu); -		break; - -#ifdef CONFIG_HOTPLUG_CPU -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		migrate_hrtimers(scpu); -		break; -#endif - -	default: -		break; -	} - -	return NOTIFY_OK; -} - -static struct notifier_block hrtimers_nb = { -	.notifier_call = hrtimer_cpu_notify, -}; -  void __init 
hrtimers_init(void)  { -	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, -			  (void *)(long)smp_processor_id()); -	register_cpu_notifier(&hrtimers_nb); +	hrtimers_prepare_cpu(smp_processor_id());  }  /** diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  			timer->it.cpu.expires = 0;  			sample_to_timespec(timer->it_clock, timer->it.cpu.expires,  					   &itp->it_value); +			return;  		} else {  			cpu_timer_sample_group(timer->it_clock, p, &now);  			unlock_task_sighand(p, &flags); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)  	int allowed_error_ns = usecs * 5;  	for (i = 0; i < iters; ++i) { -		struct timespec ts1, ts2; +		s64 kt1, kt2;  		int time_passed; -		ktime_get_ts(&ts1); +		kt1 = ktime_get_ns();  		udelay(usecs); -		ktime_get_ts(&ts2); -		time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); +		kt2 = ktime_get_ns(); +		time_passed = kt2 - kt1;  		if (i == 0 || time_passed < min)  			min = time_passed; @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)  	if (usecs > 0 && iters > 0) {  		return udelay_test_single(s, usecs, iters);  	} else if (usecs == 0) { -		struct timespec ts; +		struct timespec64 ts; -		ktime_get_ts(&ts); -		seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", -				loops_per_jiffy, ts.tv_sec, ts.tv_nsec); +		ktime_get_ts64(&ts); +		seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", +				loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);  		seq_puts(s, "usage:\n");  		seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");  		seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)  }  static struct clock_event_device ce_broadcast_hrtimer = { +	.name			= "bc_hrtimer",  	.set_state_shutdown	= bc_shutdown,  	.set_next_ktime		= bc_set_next,  	.features		= CLOCK_EVT_FEAT_ONESHOT | diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }  DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);  extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@  #include <trace/events/timer.h>  /* - * Per cpu nohz control structure + * Per-CPU nohz control structure   */  static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)  	if (delta.tv64 < tick_period.tv64)  		return; -	/* Reevalute with jiffies_lock held */ +	/* Reevaluate with jiffies_lock held */  	write_seqlock(&jiffies_lock);  	
delta = ktime_sub(now, last_jiffies_update); @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)  #ifdef CONFIG_NO_HZ_COMMON  	/*  	 * Check if the do_timer duty was dropped. We don't care about -	 * concurrency: This happens only when the cpu in charge went -	 * into a long sleep. If two cpus happen to assign themself to +	 * concurrency: This happens only when the CPU in charge went +	 * into a long sleep. If two CPUs happen to assign themselves to  	 * this duty, then the jiffies update is still serialized by  	 * jiffies_lock.  	 */ @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi  /*   * Re-evaluate the need for the tick as we switch the current task.   * It might need the tick due to per task/process properties: - * perf events, posix cpu timers, ... + * perf events, posix CPU timers, ...   */  void __tick_nohz_task_switch(void)  { @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)   *   * In case the sched_tick was stopped on this CPU, we have to check if jiffies   * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep.   */  static void tick_nohz_update_jiffies(ktime_t now)  { @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)  }  /* - * Updates the per cpu time idle statistics counters + * Updates the per-CPU time idle statistics counters   */  static void  update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) @@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)  }  /** - * get_cpu_idle_time_us - get the total idle time of a cpu + * get_cpu_idle_time_us - get the total idle time of a CPU   * @cpu: CPU number to query   * @last_update_time: variable to store update time in. Do not update   * counters if NULL.   * - * Return the cummulative idle time (since boot) for a given + * Return the cumulative idle time (since boot) for a given   * CPU, in microseconds.   *   * This time is measured via accounting rather than sampling, @@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)  EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);  /** - * get_cpu_iowait_time_us - get the total iowait time of a cpu + * get_cpu_iowait_time_us - get the total iowait time of a CPU   * @cpu: CPU number to query   * @last_update_time: variable to store update time in. Do not update   * counters if NULL.   * - * Return the cummulative iowait time (since boot) for a given + * Return the cumulative iowait time (since boot) for a given   * CPU, in microseconds.   *   * This time is measured via accounting rather than sampling, @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  	delta = next_tick - basemono;  	if (delta <= (u64)TICK_NSEC) {  		tick.tv64 = 0; + +		/* +		 * Tell the timer code that the base is not idle, i.e. undo +		 * the effect of get_next_timer_interrupt(): +		 */ +		timer_clear_idle();  		/*  		 * We've not stopped the tick yet, and there's a timer in the  		 * next period, so no point in stopping it either, bail. 
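As the comment in the hunk above describes, timer_clear_idle() exists to undo the effect of get_next_timer_interrupt(): the idle path may mark a timer base idle (recording the next expiry it plans to sleep until), and remote CPUs later use that marking to decide whether arming a timer needs to kick the sleeping CPU. The sketch below is purely illustrative and not part of this patch: it is a self-contained user-space model, the toy_* names are invented, and only the general shape of the is_idle/next_expiry handshake is taken from the comments in this diff.

/*
 * Toy model of the idle-marking handshake -- not kernel code, just an
 * illustration under the assumptions stated above.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_base {
	bool is_idle;              /* set while the CPU may sleep past the next tick */
	unsigned long next_expiry; /* earliest armed timeout, in "jiffies" */
};

/* Idle path: report the next timeout and mark the base idle. */
static unsigned long toy_get_next_timer_interrupt(struct toy_base *b)
{
	b->is_idle = true;
	return b->next_expiry;
}

/* Tick kept running after all -- the case handled in the hunk above. */
static void toy_clear_idle(struct toy_base *b)
{
	b->is_idle = false;
}

/*
 * A CPU arming a timer on this base only needs to kick (IPI) its owner if
 * the owner is idle and the new timeout is earlier than the planned wakeup.
 */
static bool toy_remote_enqueue(struct toy_base *b, unsigned long expires)
{
	if (!b->is_idle || expires >= b->next_expiry)
		return false;		/* no kick needed */
	b->next_expiry = expires;
	return true;			/* would wake the remote CPU */
}

int main(void)
{
	struct toy_base b = { .is_idle = false, .next_expiry = 1000 };

	toy_get_next_timer_interrupt(&b);	/* idle path queried the base */
	toy_clear_idle(&b);			/* ...but the tick was not stopped */

	/* With is_idle cleared, an earlier timer causes no kick: */
	printf("kick needed: %d\n", toy_remote_enqueue(&b, 500));	/* prints 0 */
	return 0;
}

The design point, per the comments added to trigger_dyntick_cpu() later in this diff, is that a remote enqueue only needs an IPI when the target base is idle and the new timer expires before the wakeup that the idle CPU already planned for; clearing the idle flag here avoids pointless IPIs while the tick keeps running.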
@@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  	}  	/* -	 * If this cpu is the one which updates jiffies, then give up -	 * the assignment and let it be taken by the cpu which runs -	 * the tick timer next, which might be this cpu as well. If we +	 * If this CPU is the one which updates jiffies, then give up +	 * the assignment and let it be taken by the CPU which runs +	 * the tick timer next, which might be this CPU as well. If we  	 * don't drop this here the jiffies might be stale and  	 * do_timer() never invoked. Keep track of the fact that it -	 * was the one which had the do_timer() duty last. If this cpu +	 * was the one which had the do_timer() duty last. If this CPU  	 * is the one which had the do_timer() duty last, we limit the -	 * sleep time to the timekeeping max_deferement value. +	 * sleep time to the timekeeping max_deferment value.  	 * Otherwise we can sleep as long as we want.  	 */  	delta = timekeeping_max_deferment(); @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)  	tick_do_update_jiffies64(now);  	cpu_load_update_nohz_stop(); +	/* +	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and +	 * the clock forward checks in the enqueue path: +	 */ +	timer_clear_idle(); +  	calc_load_exit_idle();  	touch_softlockup_watchdog_sched();  	/* @@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)  static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  {  	/* -	 * If this cpu is offline and it is the one which updates +	 * If this CPU is offline and it is the one which updates  	 * jiffies, then give up the assignment and let it be taken by -	 * the cpu which runs the tick timer next. If we don't drop +	 * the CPU which runs the tick timer next. If we don't drop  	 * this here the jiffies might be stale and do_timer() never  	 * invoked.  	 */ @@ -896,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)  	ktime_t now, expires;  	int cpu = smp_processor_id(); -	now = tick_nohz_start_idle(ts); -  	if (can_stop_idle_tick(cpu, ts)) {  		int was_stopped = ts->tick_stopped; +		now = tick_nohz_start_idle(ts);  		ts->idle_calls++;  		expires = tick_nohz_stop_sched_tick(ts, now, cpu); @@ -933,11 +944,11 @@ void tick_nohz_idle_enter(void)  	WARN_ON_ONCE(irqs_disabled());  	/* - 	 * Update the idle state in the scheduler domain hierarchy - 	 * when tick_nohz_stop_sched_tick() is called from the idle loop. - 	 * State will be updated to busy during the first busy tick after - 	 * exiting idle. - 	 */ +	 * Update the idle state in the scheduler domain hierarchy +	 * when tick_nohz_stop_sched_tick() is called from the idle loop. +	 * State will be updated to busy during the first busy tick after +	 * exiting idle. +	 */  	set_cpu_sd_state_idle();  	local_irq_disable(); @@ -1092,35 +1103,6 @@ static void tick_nohz_switch_to_nohz(void)  	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);  } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. 
- */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 -	/* Switch back to 2.6.27 behaviour */ -	ktime_t delta; - -	/* -	 * Do not touch the tick device, when the next expiry is either -	 * already reached or less/equal than the tick period. -	 */ -	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); -	if (delta.tv64 <= tick_period.tv64) -		return; - -	tick_nohz_restart(ts, now); -#endif -} -  static inline void tick_nohz_irq_enter(void)  {  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1131,10 +1113,8 @@ static inline void tick_nohz_irq_enter(void)  	now = ktime_get();  	if (ts->idle_active)  		tick_nohz_stop_idle(ts, now); -	if (ts->tick_stopped) { +	if (ts->tick_stopped)  		tick_nohz_update_jiffies(now); -		tick_nohz_kick_tick(ts, now); -	}  }  #else @@ -1211,7 +1191,7 @@ void tick_setup_sched_timer(void)  	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);  	ts->sched_timer.function = tick_sched_timer; -	/* Get the next period (per cpu) */ +	/* Get the next period (per-CPU) */  	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());  	/* Offset the tick to avert jiffies_lock contention. */ diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {  #define SECS_PER_DAY	(SECS_PER_HOUR * 24)  /** - * time_to_tm - converts the calendar time to local broken-down time + * time64_to_tm - converts the calendar time to local broken-down time   *   * @totalsecs	the number of seconds elapsed since 00:00:00 on January 1, 1970,   *		Coordinated Universal Time (UTC).   * @offset	offset seconds adding to totalsecs.   * @result	pointer to struct tm variable to receive broken-down time   */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)  {  	long days, rem, y; +	int remainder;  	const unsigned short *ip; -	days = totalsecs / SECS_PER_DAY; -	rem = totalsecs % SECS_PER_DAY; +	days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); +	rem = remainder;  	rem += offset;  	while (rem < 0) {  		rem += SECS_PER_DAY; @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)  	result->tm_mon = y;  	result->tm_mday = days + 1;  } -EXPORT_SYMBOL(time_to_tm); +EXPORT_SYMBOL(time64_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..3b65746c7f15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)  	* users are removed, this can be killed.  	
*/  	remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); -	tk->tkr_mono.xtime_nsec -= remainder; -	tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; -	tk->ntp_error += remainder << tk->ntp_error_shift; -	tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; +	if (remainder != 0) { +		tk->tkr_mono.xtime_nsec -= remainder; +		tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; +		tk->ntp_error += remainder << tk->ntp_error_shift; +		tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; +	}  }  #else  #define old_vsyscall_fixup(tk) @@ -2186,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void)  	return now;  } +EXPORT_SYMBOL(get_monotonic_coarse64);  /*   * Must hold jiffies_lock diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..555670a5143c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;  EXPORT_SYMBOL(jiffies_64);  /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each + * level has a different granularity. + * + * The level granularity is:		LVL_CLK_DIV ^ lvl + * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther away the expiry time is, the higher the array level and + * therefore the coarser the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. Timers with an expiry time above the + * capacity of the last wheel level are force-expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants are a good compromise between + * array size and granularity. 
+ * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset  Granularity            Range + *  0      0         1 ms                0 ms -         63 ms + *  1     64         8 ms               64 ms -        511 ms + *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s) + *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s) + *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m) + *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m) + *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h) + *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d) + *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ  300 + * Level Offset  Granularity            Range + *  0	   0         3 ms                0 ms -        210 ms + *  1	  64        26 ms              213 ms -       1703 ms (213ms - ~1s) + *  2	 128       213 ms             1706 ms -      13650 ms (~1s - ~13s) + *  3	 192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m) + *  4	 256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m) + *  5	 320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h) + *  6	 384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h) + *  7	 448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d) + *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ  250 + * Level Offset  Granularity            Range + *  0	   0         4 ms                0 ms -        255 ms + *  1	  64        32 ms              256 ms -       2047 ms (256ms - ~2s) + *  2	 128       256 ms             2048 ms -      16383 ms (~2s - ~16s) + *  3	 192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m) + *  4	 256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m) + *  5	 320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h) + *  6	 384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h) + *  7	 448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d) + *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ  100 + * Level Offset  Granularity            Range + *  0	   0         10 ms               0 ms -        630 ms + *  1	  64         80 ms             640 ms -       5110 ms (640ms - ~5s) + *  2	 128        640 ms            5120 ms -      40950 ms (~5s - ~40s) + *  3	 192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m) + *  4	 256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m) + *  5	 320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h) + *  6	 384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d) + *  7	 448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)   */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { -	struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { -	struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT	3 +#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK	(LVL_CLK_DIV - 1) +#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n)) -struct tvec_base { -	spinlock_t lock; -	struct timer_list *running_timer; -	unsigned long timer_jiffies; -	unsigned long next_timer; -	unsigned long active_timers; -	unsigned long all_timers; -	int cpu; -	bool migration_enabled; -	bool nohz_active; -	struct tvec_root tv1; -	struct tvec tv2; -	struct tvec tv3; -	struct tvec tv4; -	struct tvec tv5; -} ____cacheline_aligned; +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS	6 +#define LVL_SIZE	(1UL << LVL_BITS) +#define LVL_MASK	(LVL_SIZE - 1) +#define LVL_OFFS(n)	((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH	9 +# else +# define LVL_DEPTH	8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES	2 +# define BASE_STD	0 +# define BASE_DEF	1 +#else +# define NR_BASES	1 +# define BASE_STD	0 +# define BASE_DEF	0 +#endif +struct timer_base { +	spinlock_t		lock; +	struct timer_list	*running_timer; +	unsigned long		clk; +	unsigned long		next_expiry; +	unsigned int		cpu; +	bool			migration_enabled; +	bool			nohz_active; +	bool			is_idle; +	DECLARE_BITMAP(pending_map, WHEEL_SIZE); +	struct hlist_head	vectors[WHEEL_SIZE]; +} ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)  unsigned int sysctl_timer_migration = 1; @@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)  	unsigned int cpu;  	/* Avoid the loop, if nothing to update */ -	if (this_cpu_read(tvec_bases.migration_enabled) == on) +	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)  		return;  	for_each_possible_cpu(cpu) { -		per_cpu(tvec_bases.migration_enabled, cpu) = on; +		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; +		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;  		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;  		if (!update_nohz)  			continue; -		per_cpu(tvec_bases.nohz_active, cpu) = true; +		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; +		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;  		per_cpu(hrtimer_bases.nohz_active, cpu) = true;  	}  } @@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,  	mutex_unlock(&mutex);  	return ret;  } - -static inline struct tvec_base *get_target_base(struct tvec_base *base, -						int pinned) -{ -	if (pinned || !base->migration_enabled) -		return this_cpu_ptr(&tvec_bases); -	return 
per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); -} -#else -static inline struct tvec_base *get_target_base(struct tvec_base *base, -						int pinned) -{ -	return this_cpu_ptr(&tvec_bases); -}  #endif  static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)  }  EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. - */ -void set_timer_slack(struct timer_list *timer, int slack_hz) + +static inline unsigned int timer_get_idx(struct timer_list *timer)  { -	timer->slack = slack_hz; +	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;  } -EXPORT_SYMBOL_GPL(set_timer_slack); -static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)  { -	unsigned long expires = timer->expires; -	unsigned long idx = expires - base->timer_jiffies; -	struct hlist_head *vec; +	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | +			idx << TIMER_ARRAYSHIFT; +} -	if (idx < TVR_SIZE) { -		int i = expires & TVR_MASK; -		vec = base->tv1.vec + i; -	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) { -		int i = (expires >> TVR_BITS) & TVN_MASK; -		vec = base->tv2.vec + i; -	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { -		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; -		vec = base->tv3.vec + i; -	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { -		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; -		vec = base->tv4.vec + i; -	} else if ((signed long) idx < 0) { -		/* -		 * Can happen if you add a timer with expires == jiffies, -		 * or you set a timer to go off in the past -		 */ -		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ +	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); +	return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + +static int calc_wheel_index(unsigned long expires, unsigned long clk) +{ +	unsigned long delta = expires - clk; +	unsigned int idx; + +	if (delta < LVL_START(1)) { +		idx = calc_index(expires, 0); +	} else if (delta < LVL_START(2)) { +		idx = calc_index(expires, 1); +	} else if (delta < LVL_START(3)) { +		idx = calc_index(expires, 2); +	} else if (delta < LVL_START(4)) { +		idx = calc_index(expires, 3); +	} else if (delta < LVL_START(5)) { +		idx = calc_index(expires, 4); +	} else if (delta < LVL_START(6)) { +		idx = calc_index(expires, 5); +	} else if (delta < LVL_START(7)) { +		idx = calc_index(expires, 6); +	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { +		idx = calc_index(expires, 7); +	} else if ((long) delta < 0) { +		idx = clk & LVL_MASK;  	} else { -		int i; -		/* If the timeout is larger than MAX_TVAL (on 64-bit -		 * architectures or with CONFIG_BASE_SMALL=1) then we -		 * use the maximum timeout. +		/* +		 * Force expire obscene large timeouts to expire at the +		 * capacity limit of the wheel.  		 
*/ -		if (idx > MAX_TVAL) { -			idx = MAX_TVAL; -			expires = idx + base->timer_jiffies; -		} -		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; -		vec = base->tv5.vec + i; +		if (expires >= WHEEL_TIMEOUT_CUTOFF) +			expires = WHEEL_TIMEOUT_MAX; + +		idx = calc_index(expires, LVL_DEPTH - 1);  	} +	return idx; +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, +			  unsigned int idx) +{ +	hlist_add_head(&timer->entry, base->vectors + idx); +	__set_bit(idx, base->pending_map); +	timer_set_idx(timer, idx); +} + +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ +	unsigned int idx; -	hlist_add_head(&timer->entry, vec); +	idx = calc_wheel_index(timer->expires, base->clk); +	enqueue_timer(base, timer, idx);  } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)  { -	/* Advance base->jiffies, if the base is empty */ -	if (!base->all_timers++) -		base->timer_jiffies = jiffies; +	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) +		return; -	__internal_add_timer(base, timer);  	/* -	 * Update base->active_timers and base->next_timer +	 * TODO: This wants some optimizing similar to the code below, but we +	 * will do that when we switch from push to pull for deferrable timers.  	 */ -	if (!(timer->flags & TIMER_DEFERRABLE)) { -		if (!base->active_timers++ || -		    time_before(timer->expires, base->next_timer)) -			base->next_timer = timer->expires; +	if (timer->flags & TIMER_DEFERRABLE) { +		if (tick_nohz_full_cpu(base->cpu)) +			wake_up_nohz_cpu(base->cpu); +		return;  	}  	/* -	 * Check whether the other CPU is in dynticks mode and needs -	 * to be triggered to reevaluate the timer wheel. -	 * We are protected against the other CPU fiddling -	 * with the timer by holding the timer base lock. This also -	 * makes sure that a CPU on the way to stop its tick can not -	 * evaluate the timer wheel. -	 * -	 * Spare the IPI for deferrable timers on idle targets though. -	 * The next busy ticks will take care of it. Except full dynticks -	 * require special care against races with idle_cpu(), lets deal -	 * with that later. +	 * We might have to IPI the remote CPU if the base is idle and the +	 * timer is not deferrable. 
If the other CPU is on the way to idle +	 * then it can't set base->is_idle as we hold the base lock:  	 */ -	if (base->nohz_active) { -		if (!(timer->flags & TIMER_DEFERRABLE) || -		    tick_nohz_full_cpu(base->cpu)) -			wake_up_nohz_cpu(base->cpu); -	} +	if (!base->is_idle) +		return; + +	/* Check whether this is the new first expiring timer: */ +	if (time_after_eq(timer->expires, base->next_expiry)) +		return; + +	/* +	 * Set the next expiry time and kick the CPU so it can reevaluate the +	 * wheel: +	 */ +	base->next_expiry = timer->expires; +		wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ +	__internal_add_timer(base, timer); +	trigger_dyntick_cpu(base, timer);  }  #ifdef CONFIG_TIMER_STATS @@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,  {  	timer->entry.pprev = NULL;  	timer->flags = flags | raw_smp_processor_id(); -	timer->slack = -1;  #ifdef CONFIG_TIMER_STATS  	timer->start_site = NULL;  	timer->start_pid = -1; @@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)  	entry->next = LIST_POISON2;  } -static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) -{ -	detach_timer(timer, true); -	if (!(timer->flags & TIMER_DEFERRABLE)) -		base->active_timers--; -	base->all_timers--; -} - -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base,  			     bool clear_pending)  { +	unsigned idx = timer_get_idx(timer); +  	if (!timer_pending(timer))  		return 0; +	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) +		__clear_bit(idx, base->pending_map); +  	detach_timer(timer, clear_pending); -	if (!(timer->flags & TIMER_DEFERRABLE)) { -		base->active_timers--; -		if (timer->expires == base->next_timer) -			base->next_timer = base->timer_jiffies; -	} -	/* If this was the last timer, advance base->jiffies */ -	if (!--base->all_timers) -		base->timer_jiffies = jiffies;  	return 1;  } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ +	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + +	/* +	 * If the timer is deferrable and nohz is active then we need to use +	 * the deferrable base. +	 */ +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && +	    (tflags & TIMER_DEFERRABLE)) +		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); +	return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +	/* +	 * If the timer is deferrable and nohz is active then we need to use +	 * the deferrable base. 
+	 */ +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && +	    (tflags & TIMER_DEFERRABLE)) +		base = this_cpu_ptr(&timer_bases[BASE_DEF]); +	return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ +	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ +#ifdef CONFIG_SMP +	if ((tflags & TIMER_PINNED) || !base->migration_enabled) +		return get_timer_this_cpu_base(tflags); +	return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else +	return get_timer_this_cpu_base(tflags); +#endif +} + +static inline void forward_timer_base(struct timer_base *base) +{ +	/* +	 * We only forward the base when it's idle and we have a delta between +	 * base clock and jiffies. +	 */ +	if (!base->is_idle || (long) (jiffies - base->clk) < 2) +		return; + +	/* +	 * If the next expiry value is > jiffies, then we fast forward to +	 * jiffies otherwise we forward to the next expiry value. +	 */ +	if (time_after(base->next_expiry, jiffies)) +		base->clk = jiffies; +	else +		base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ +	return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ +	struct timer_base *target = __get_target_base(base, tflags); + +	forward_timer_base(target); +	return target; +} +  /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too.   *   * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array.   * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done.   
*/ -static struct tvec_base *lock_timer_base(struct timer_list *timer, -					unsigned long *flags) +static struct timer_base *lock_timer_base(struct timer_list *timer, +					  unsigned long *flags)  	__acquires(timer->base->lock)  {  	for (;;) { +		struct timer_base *base;  		u32 tf = timer->flags; -		struct tvec_base *base;  		if (!(tf & TIMER_MIGRATING)) { -			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); +			base = get_timer_base(tf);  			spin_lock_irqsave(&base->lock, *flags);  			if (timer->flags == tf)  				return base; @@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,  }  static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, -	    bool pending_only, int pinned) +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  { -	struct tvec_base *base, *new_base; -	unsigned long flags; +	struct timer_base *base, *new_base; +	unsigned int idx = UINT_MAX; +	unsigned long clk = 0, flags;  	int ret = 0; +	/* +	 * This is a common optimization triggered by the networking code - if +	 * the timer is re-modified to have the same timeout or ends up in the +	 * same array bucket then just return: +	 */ +	if (timer_pending(timer)) { +		if (timer->expires == expires) +			return 1; +		/* +		 * Take the current timer_jiffies of base, but without holding +		 * the lock! +		 */ +		base = get_timer_base(timer->flags); +		clk = base->clk; + +		idx = calc_wheel_index(expires, clk); + +		/* +		 * Retrieve and compare the array index of the pending +		 * timer. If it matches set the expiry to the new value so a +		 * subsequent call will exit in the expires check above. +		 */ +		if (idx == timer_get_idx(timer)) { +			timer->expires = expires; +			return 1; +		} +	} +  	timer_stats_timer_set_start_info(timer);  	BUG_ON(!timer->function); @@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,  	debug_activate(timer, expires); -	new_base = get_target_base(base, pinned); +	new_base = get_target_base(base, timer->flags);  	if (base != new_base) {  		/* -		 * We are trying to schedule the timer on the local CPU. +		 * We are trying to schedule the timer on the new base.  		 * However we can't change timer's base while it is running,  		 * otherwise del_timer_sync() can't detect that the timer's -		 * handler yet has not finished. This also guarantees that -		 * the timer is serialized wrt itself. +		 * handler yet has not finished. This also guarantees that the +		 * timer is serialized wrt itself.  		 */  		if (likely(base->running_timer != timer)) {  			/* See the comment in lock_timer_base() */ @@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,  	}  	timer->expires = expires; -	internal_add_timer(base, timer); +	/* +	 * If 'idx' was calculated above and the base time did not advance +	 * between calculating 'idx' and taking the lock, only enqueue_timer() +	 * and trigger_dyntick_cpu() is required. Otherwise we need to +	 * (re)calculate the wheel index via internal_add_timer(). 
+	 */ +	if (idx != UINT_MAX && clk == base->clk) { +		enqueue_timer(base, timer, idx); +		trigger_dyntick_cpu(base, timer); +	} else { +		internal_add_timer(base, timer); +	}  out_unlock:  	spin_unlock_irqrestore(&base->lock, flags); @@ -825,49 +1057,10 @@ out_unlock:   */  int mod_timer_pending(struct timer_list *timer, unsigned long expires)  { -	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); +	return __mod_timer(timer, expires, true);  }  EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - *   1) calculate the maximum (absolute) time - *   2) calculate the highest bit where the expires and new max are different - *   3) use this bit to make a mask - *   4) use the bitmask to round down the maximum time, so that all last - *      bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ -	unsigned long expires_limit, mask; -	int bit; - -	if (timer->slack >= 0) { -		expires_limit = expires + timer->slack; -	} else { -		long delta = expires - jiffies; - -		if (delta < 256) -			return expires; - -		expires_limit = expires + delta / 256; -	} -	mask = expires ^ expires_limit; -	if (mask == 0) -		return expires; - -	bit = __fls(mask); - -	mask = (1UL << bit) - 1; - -	expires_limit = expires_limit & ~(mask); - -	return expires_limit; -} -  /**   * mod_timer - modify a timer's timeout   * @timer: the timer to be modified @@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)   */  int mod_timer(struct timer_list *timer, unsigned long expires)  { -	expires = apply_slack(timer, expires); - -	/* -	 * This is a common optimization triggered by the -	 * networking code - if the timer is re-modified -	 * to be the same thing then just return: -	 */ -	if (timer_pending(timer) && timer->expires == expires) -		return 1; - -	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); +	return __mod_timer(timer, expires, false);  }  EXPORT_SYMBOL(mod_timer);  /** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline.  If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - *     del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ -	if (timer->expires == expires && timer_pending(timer)) -		return 1; - -	return __mod_timer(timer, expires, false, TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - -/**   * add_timer - start a timer   * @timer: the timer to be added   * @@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);   */  void add_timer_on(struct timer_list *timer, int cpu)  { -	struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); -	struct tvec_base *base; +	struct timer_base *new_base, *base;  	unsigned long flags;  	timer_stats_timer_set_start_info(timer);  	BUG_ON(timer_pending(timer) || !timer->function); +	new_base = get_timer_cpu_base(timer->flags, cpu); +  	/*  	 * If @timer was on a different CPU, it should be migrated with the  	 * old base locked to prevent other operations proceeding with the @@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);   */  int del_timer(struct timer_list *timer)  { -	struct tvec_base *base; +	struct timer_base *base;  	unsigned long flags;  	int ret = 0; @@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);   */  int try_to_del_timer_sync(struct timer_list *timer)  { -	struct tvec_base *base; +	struct timer_base *base;  	unsigned long flags;  	int ret = -1; @@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)  EXPORT_SYMBOL(del_timer_sync);  #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ -	/* cascade all the timers from tv up one level */ -	struct timer_list *timer; -	struct hlist_node *tmp; -	struct hlist_head tv_list; - -	hlist_move_list(tv->vec + index, &tv_list); - -	/* -	 * We are removing _all_ timers from the list, so we -	 * don't have to detach them individually. -	 */ -	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { -		/* No accounting, while moving them */ -		__internal_add_timer(base, timer); -	} - -	return index; -} -  static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  			  unsigned long data)  { @@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  	}  } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. 
- */ -static inline void __run_timers(struct tvec_base *base) +static void expire_timers(struct timer_base *base, struct hlist_head *head)  { -	struct timer_list *timer; +	while (!hlist_empty(head)) { +		struct timer_list *timer; +		void (*fn)(unsigned long); +		unsigned long data; -	spin_lock_irq(&base->lock); +		timer = hlist_entry(head->first, struct timer_list, entry); +		timer_stats_account_timer(timer); -	while (time_after_eq(jiffies, base->timer_jiffies)) { -		struct hlist_head work_list; -		struct hlist_head *head = &work_list; -		int index; +		base->running_timer = timer; +		detach_timer(timer, true); -		if (!base->all_timers) { -			base->timer_jiffies = jiffies; -			break; +		fn = timer->function; +		data = timer->data; + +		if (timer->flags & TIMER_IRQSAFE) { +			spin_unlock(&base->lock); +			call_timer_fn(timer, fn, data); +			spin_lock(&base->lock); +		} else { +			spin_unlock_irq(&base->lock); +			call_timer_fn(timer, fn, data); +			spin_lock_irq(&base->lock);  		} +	} +} -		index = base->timer_jiffies & TVR_MASK; +static int __collect_expired_timers(struct timer_base *base, +				    struct hlist_head *heads) +{ +	unsigned long clk = base->clk; +	struct hlist_head *vec; +	int i, levels = 0; +	unsigned int idx; -		/* -		 * Cascade timers: -		 */ -		if (!index && -			(!cascade(base, &base->tv2, INDEX(0))) && -				(!cascade(base, &base->tv3, INDEX(1))) && -					!cascade(base, &base->tv4, INDEX(2))) -			cascade(base, &base->tv5, INDEX(3)); -		++base->timer_jiffies; -		hlist_move_list(base->tv1.vec + index, head); -		while (!hlist_empty(head)) { -			void (*fn)(unsigned long); -			unsigned long data; -			bool irqsafe; - -			timer = hlist_entry(head->first, struct timer_list, entry); -			fn = timer->function; -			data = timer->data; -			irqsafe = timer->flags & TIMER_IRQSAFE; - -			timer_stats_account_timer(timer); - -			base->running_timer = timer; -			detach_expired_timer(timer, base); - -			if (irqsafe) { -				spin_unlock(&base->lock); -				call_timer_fn(timer, fn, data); -				spin_lock(&base->lock); -			} else { -				spin_unlock_irq(&base->lock); -				call_timer_fn(timer, fn, data); -				spin_lock_irq(&base->lock); -			} +	for (i = 0; i < LVL_DEPTH; i++) { +		idx = (clk & LVL_MASK) + i * LVL_SIZE; + +		if (__test_and_clear_bit(idx, base->pending_map)) { +			vec = base->vectors + idx; +			hlist_move_list(vec, heads++); +			levels++;  		} +		/* Is it time to look at the next level? */ +		if (clk & LVL_CLK_MASK) +			break; +		/* Shift clock for the next level granularity */ +		clk >>= LVL_CLK_SHIFT;  	} -	base->running_timer = NULL; -	spin_unlock_irq(&base->lock); +	return levels;  }  #ifdef CONFIG_NO_HZ_COMMON  /* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, +			       unsigned clk) +{ +	unsigned pos, start = offset + clk; +	unsigned end = offset + LVL_SIZE; + +	pos = find_next_bit(base->pending_map, end, start); +	if (pos < end) +		return pos - start; + +	pos = find_next_bit(base->pending_map, start, offset); +	return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock.   
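/*
 * Illustrative sketch (editor's addition, not from this patch): the circular
 * search next_pending_bucket() performs, in userspace form. Look from @clk
 * to the end of the level first, then wrap and look from the start of the
 * level up to @clk. The find_next_set() helper stands in for find_next_bit()
 * and, like the sk_ names, is an assumption of this sketch.
 */
#include <stdio.h>
#include <stdbool.h>

#define SK_LVL_SIZE 64			/* assumed buckets per level */

static bool sk_bitmap[SK_LVL_SIZE];	/* one level's pending bits  */

static int find_next_set(const bool *map, int end, int start)
{
	for (int i = start; i < end; i++)
		if (map[i])
			return i;
	return end;
}

/* Distance (in buckets) from @clk to the next pending bucket, -1 if empty. */
static int sketch_next_pending(int clk)
{
	int pos = find_next_set(sk_bitmap, SK_LVL_SIZE, clk);

	if (pos < SK_LVL_SIZE)
		return pos - clk;

	pos = find_next_set(sk_bitmap, clk, 0);
	return pos < clk ? pos + SK_LVL_SIZE - clk : -1;
}

int main(void)
{
	sk_bitmap[2] = true;
	/* From clk = 60 the next pending bucket is index 2, 6 ticks away. */
	printf("%d\n", sketch_next_pending(60));
	return 0;
}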
*/ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ -	unsigned long timer_jiffies = base->timer_jiffies; -	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; -	int index, slot, array, found = 0; -	struct timer_list *nte; -	struct tvec *varray[4]; - -	/* Look for timer events in tv1. */ -	index = slot = timer_jiffies & TVR_MASK; -	do { -		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { -			if (nte->flags & TIMER_DEFERRABLE) -				continue; - -			found = 1; -			expires = nte->expires; -			/* Look at the cascade bucket(s)? */ -			if (!index || slot < index) -				goto cascade; -			return expires; +static unsigned long __next_timer_interrupt(struct timer_base *base) +{ +	unsigned long clk, next, adj; +	unsigned lvl, offset = 0; + +	next = base->clk + NEXT_TIMER_MAX_DELTA; +	clk = base->clk; +	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { +		int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + +		if (pos >= 0) { +			unsigned long tmp = clk + (unsigned long) pos; + +			tmp <<= LVL_SHIFT(lvl); +			if (time_before(tmp, next)) +				next = tmp;  		} -		slot = (slot + 1) & TVR_MASK; -	} while (slot != index); - -cascade: -	/* Calculate the next cascade event */ -	if (index) -		timer_jiffies += TVR_SIZE - index; -	timer_jiffies >>= TVR_BITS; - -	/* Check tv2-tv5. */ -	varray[0] = &base->tv2; -	varray[1] = &base->tv3; -	varray[2] = &base->tv4; -	varray[3] = &base->tv5; - -	for (array = 0; array < 4; array++) { -		struct tvec *varp = varray[array]; - -		index = slot = timer_jiffies & TVN_MASK; -		do { -			hlist_for_each_entry(nte, varp->vec + slot, entry) { -				if (nte->flags & TIMER_DEFERRABLE) -					continue; - -				found = 1; -				if (time_before(nte->expires, expires)) -					expires = nte->expires; -			} -			/* -			 * Do we still search for the first timer or are -			 * we looking up the cascade buckets ? -			 */ -			if (found) { -				/* Look at the cascade bucket(s)? */ -				if (!index || slot < index) -					break; -				return expires; -			} -			slot = (slot + 1) & TVN_MASK; -		} while (slot != index); - -		if (index) -			timer_jiffies += TVN_SIZE - index; -		timer_jiffies >>= TVN_BITS; +		/* +		 * Clock for the next level. If the current level clock lower +		 * bits are zero, we look at the next level as is. If not we +		 * need to advance it by one because that's going to be the +		 * next expiring bucket in that level. base->clk is the next +		 * expiring jiffie. So in case of: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    0    0 +		 * +		 * we have to look at all levels @index 0. With +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    0    2 +		 * +		 * LVL0 has the next expiring bucket @index 2. The upper +		 * levels have the next expiring bucket @index 1. +		 * +		 * In case that the propagation wraps the next level the same +		 * rules apply: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    F    2 +		 * +		 * So after looking at LVL0 we get: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 +		 *  0    0    0    1    0 +		 * +		 * So no propagation from LVL1 to LVL2 because that happened +		 * with the add already, but then we need to propagate further +		 * from LVL2 to LVL3. +		 * +		 * So the simple check whether the lower bits of the current +		 * level are 0 or not is sufficient for all cases. +		 */ +		adj = clk & LVL_CLK_MASK ? 
1 : 0; +		clk >>= LVL_CLK_SHIFT; +		clk += adj;  	} -	return expires; +	return next;  }  /* @@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)   */  u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  { -	struct tvec_base *base = this_cpu_ptr(&tvec_bases); +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);  	u64 expires = KTIME_MAX;  	unsigned long nextevt; @@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  		return expires;  	spin_lock(&base->lock); -	if (base->active_timers) { -		if (time_before_eq(base->next_timer, base->timer_jiffies)) -			base->next_timer = __next_timer_interrupt(base); -		nextevt = base->next_timer; -		if (time_before_eq(nextevt, basej)) -			expires = basem; -		else -			expires = basem + (nextevt - basej) * TICK_NSEC; +	nextevt = __next_timer_interrupt(base); +	base->next_expiry = nextevt; +	/* +	 * We have a fresh next event. Check whether we can forward the base: +	 */ +	if (time_after(nextevt, jiffies)) +		base->clk = jiffies; +	else if (time_after(nextevt, base->clk)) +		base->clk = nextevt; + +	if (time_before_eq(nextevt, basej)) { +		expires = basem; +		base->is_idle = false; +	} else { +		expires = basem + (nextevt - basej) * TICK_NSEC; +		/* +		 * If we expect to sleep more than a tick, mark the base idle: +		 */ +		if ((expires - basem) > TICK_NSEC) +			base->is_idle = true;  	}  	spin_unlock(&base->lock);  	return cmp_next_hrtimer_event(basem, expires);  } + +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +	/* +	 * We do this unlocked. The worst outcome is a remote enqueue sending +	 * a pointless IPI, but taking the lock would just make the window for +	 * sending the IPI a few instructions smaller for the cost of taking +	 * the lock in the exit from idle path. +	 */ +	base->is_idle = false; +} + +static int collect_expired_timers(struct timer_base *base, +				  struct hlist_head *heads) +{ +	/* +	 * NOHZ optimization. After a long idle sleep we need to forward the +	 * base to current jiffies. Avoid a loop by searching the bitfield for +	 * the next expiring timer. +	 */ +	if ((long)(jiffies - base->clk) > 2) { +		unsigned long next = __next_timer_interrupt(base); + +		/* +		 * If the next timer is ahead of time forward to current +		 * jiffies, otherwise forward to the next expiry time: +		 */ +		if (time_after(next, jiffies)) { +			/* The call site will increment clock! */ +			base->clk = jiffies - 1; +			return 0; +		} +		base->clk = next; +	} +	return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, +					 struct hlist_head *heads) +{ +	return __collect_expired_timers(base, heads); +}  #endif  /* @@ -1411,15 +1601,42 @@ void update_process_times(int user_tick)  	run_posix_cpu_timers(p);  } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. 
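/*
 * Illustrative sketch (editor's addition, not from this patch): the NOHZ
 * forwarding done by collect_expired_timers() above. After a long idle
 * period base->clk is not stepped one jiffy at a time; it jumps either to
 * "now" (when the next timer is still in the future) or straight to the next
 * expiry. Names prefixed sk_ are assumptions of this sketch.
 */
#include <stdio.h>

static unsigned long sk_forward_clk(unsigned long clk, unsigned long now,
				    unsigned long next_expiry)
{
	if ((long)(now - clk) <= 2)
		return clk;		/* close enough, walk normally */
	if ((long)(next_expiry - now) > 0)
		return now - 1;		/* caller will increment clk   */
	return next_expiry;		/* expired work starts here    */
}

int main(void)
{
	/* Woke up at jiffy 10000 with clk stuck at 2000 and the next timer
	 * due at 12000: jump straight to 9999. */
	printf("%lu\n", sk_forward_clk(2000, 10000, 12000));
	return 0;
}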
+ */ +static inline void __run_timers(struct timer_base *base) +{ +	struct hlist_head heads[LVL_DEPTH]; +	int levels; + +	if (!time_after_eq(jiffies, base->clk)) +		return; + +	spin_lock_irq(&base->lock); + +	while (time_after_eq(jiffies, base->clk)) { + +		levels = collect_expired_timers(base, heads); +		base->clk++; + +		while (levels--) +			expire_timers(base, heads + levels); +	} +	base->running_timer = NULL; +	spin_unlock_irq(&base->lock); +} +  /*   * This function runs timers and the timer-tq in bottom half context.   */  static void run_timer_softirq(struct softirq_action *h)  { -	struct tvec_base *base = this_cpu_ptr(&tvec_bases); +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); -	if (time_after_eq(jiffies, base->timer_jiffies)) -		__run_timers(base); +	__run_timers(base); +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) +		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));  }  /* @@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h)   */  void run_local_timers(void)  { +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); +  	hrtimer_run_queues(); +	/* Raise the softirq only if required. */ +	if (time_before(jiffies, base->clk)) { +		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) +			return; +		/* CPU is awake, so check the deferrable base. */ +		base++; +		if (time_before(jiffies, base->clk)) +			return; +	}  	raise_softirq(TIMER_SOFTIRQ);  } @@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout)  	expire = timeout + jiffies;  	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); -	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED); +	__mod_timer(&timer, expire, false);  	schedule();  	del_singleshot_timer_sync(&timer); @@ -1563,87 +1791,62 @@ signed long __sched schedule_timeout_idle(signed long timeout)  EXPORT_SYMBOL(schedule_timeout_idle);  #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)  {  	struct timer_list *timer;  	int cpu = new_base->cpu;  	while (!hlist_empty(head)) {  		timer = hlist_entry(head->first, struct timer_list, entry); -		/* We ignore the accounting on the dying cpu */  		detach_timer(timer, false);  		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;  		internal_add_timer(new_base, timer);  	}  } -static void migrate_timers(int cpu) +int timers_dead_cpu(unsigned int cpu)  { -	struct tvec_base *old_base; -	struct tvec_base *new_base; -	int i; +	struct timer_base *old_base; +	struct timer_base *new_base; +	int b, i;  	BUG_ON(cpu_online(cpu)); -	old_base = per_cpu_ptr(&tvec_bases, cpu); -	new_base = get_cpu_ptr(&tvec_bases); -	/* -	 * The caller is globally serialized and nobody else -	 * takes two locks at once, deadlock is not possible. 
-	 */ -	spin_lock_irq(&new_base->lock); -	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - -	BUG_ON(old_base->running_timer); - -	for (i = 0; i < TVR_SIZE; i++) -		migrate_timer_list(new_base, old_base->tv1.vec + i); -	for (i = 0; i < TVN_SIZE; i++) { -		migrate_timer_list(new_base, old_base->tv2.vec + i); -		migrate_timer_list(new_base, old_base->tv3.vec + i); -		migrate_timer_list(new_base, old_base->tv4.vec + i); -		migrate_timer_list(new_base, old_base->tv5.vec + i); -	} -	old_base->active_timers = 0; -	old_base->all_timers = 0; +	for (b = 0; b < NR_BASES; b++) { +		old_base = per_cpu_ptr(&timer_bases[b], cpu); +		new_base = get_cpu_ptr(&timer_bases[b]); +		/* +		 * The caller is globally serialized and nobody else +		 * takes two locks at once, deadlock is not possible. +		 */ +		spin_lock_irq(&new_base->lock); +		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); -	spin_unlock(&old_base->lock); -	spin_unlock_irq(&new_base->lock); -	put_cpu_ptr(&tvec_bases); -} +		BUG_ON(old_base->running_timer); -static int timer_cpu_notify(struct notifier_block *self, -				unsigned long action, void *hcpu) -{ -	switch (action) { -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		migrate_timers((long)hcpu); -		break; -	default: -		break; -	} +		for (i = 0; i < WHEEL_SIZE; i++) +			migrate_timer_list(new_base, old_base->vectors + i); -	return NOTIFY_OK; +		spin_unlock(&old_base->lock); +		spin_unlock_irq(&new_base->lock); +		put_cpu_ptr(&timer_bases); +	} +	return 0;  } -static inline void timer_register_cpu_notifier(void) -{ -	cpu_notifier(timer_cpu_notify, 0); -} -#else -static inline void timer_register_cpu_notifier(void) { }  #endif /* CONFIG_HOTPLUG_CPU */  static void __init init_timer_cpu(int cpu)  { -	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); - -	base->cpu = cpu; -	spin_lock_init(&base->lock); +	struct timer_base *base; +	int i; -	base->timer_jiffies = jiffies; -	base->next_timer = base->timer_jiffies; +	for (i = 0; i < NR_BASES; i++) { +		base = per_cpu_ptr(&timer_bases[i], cpu); +		base->cpu = cpu; +		spin_lock_init(&base->lock); +		base->clk = jiffies; +	}  }  static void __init init_timer_cpus(void) @@ -1658,7 +1861,6 @@ void __init init_timers(void)  {  	init_timer_cpus();  	init_timer_stats(); -	timer_register_cpu_notifier();  	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);  } @@ -1702,9 +1904,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)  }  /** - * usleep_range - Drop in replacement for udelay where wakeup is flexible + * usleep_range - Sleep for an approximate time   * @min: Minimum time in usecs to sleep   * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay().  The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep.   
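/*
 * Illustrative sketch (editor's addition, not from this patch): typical use
 * of usleep_range() as documented above, polling from sleepable context in
 * driver-style code. sk_read_status() is a made-up hardware poll and exists
 * only for this sketch.
 */
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/errno.h>

extern bool sk_read_status(void);	/* assumption: hypothetical poll */

static int sk_wait_ready(void)
{
	int retries = 100;

	while (retries--) {
		if (sk_read_status())
			return 0;
		/* A flexible 100-200us window lets hrtimers coalesce wakeups
		 * instead of busy-waiting in udelay(). */
		usleep_range(100, 200);
	}
	return -ETIMEDOUT;
}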
*/  void __sched usleep_range(unsigned long min, unsigned long max)  { diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)  static int tstats_show(struct seq_file *m, void *v)  { -	struct timespec period; +	struct timespec64 period;  	struct entry *entry;  	unsigned long ms;  	long events = 0; @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)  	time = ktime_sub(time_stop, time_start); -	period = ktime_to_timespec(time); +	period = ktime_to_timespec64(time);  	ms = period.tv_nsec / 1000000;  	seq_puts(m, "Timer Stats Version: v0.3\n"); -	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); +	seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);  	if (atomic_read(&overflow_count))  		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));  	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -82,6 +82,104 @@ static int min_online = -1;  static int max_online;  /* + * Attempt to take a CPU offline.  Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations.  The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, +		     unsigned long *sum_offl, int *min_offl, int *max_offl) +{ +	unsigned long delta; +	int ret; +	unsigned long starttime; + +	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) +		return false; + +	if (verbose) +		pr_alert("%s" TORTURE_FLAG +			 "torture_onoff task: offlining %d\n", +			 torture_type, cpu); +	starttime = jiffies; +	(*n_offl_attempts)++; +	ret = cpu_down(cpu); +	if (ret) { +		if (verbose) +			pr_alert("%s" TORTURE_FLAG +				 "torture_onoff task: offline %d failed: errno %d\n", +				 torture_type, cpu, ret); +	} else { +		if (verbose) +			pr_alert("%s" TORTURE_FLAG +				 "torture_onoff task: offlined %d\n", +				 torture_type, cpu); +		(*n_offl_successes)++; +		delta = jiffies - starttime; +		sum_offl += delta; +		if (*min_offl < 0) { +			*min_offl = delta; +			*max_offl = delta; +		} +		if (*min_offl > delta) +			*min_offl = delta; +		if (*max_offl < delta) +			*max_offl = delta; +	} + +	return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online.  Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations.  The + * caller can detect other failures by looking at the statistics. 
+ */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, +		    unsigned long *sum_onl, int *min_onl, int *max_onl) +{ +	unsigned long delta; +	int ret; +	unsigned long starttime; + +	if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) +		return false; + +	if (verbose) +		pr_alert("%s" TORTURE_FLAG +			 "torture_onoff task: onlining %d\n", +			 torture_type, cpu); +	starttime = jiffies; +	(*n_onl_attempts)++; +	ret = cpu_up(cpu); +	if (ret) { +		if (verbose) +			pr_alert("%s" TORTURE_FLAG +				 "torture_onoff task: online %d failed: errno %d\n", +				 torture_type, cpu, ret); +	} else { +		if (verbose) +			pr_alert("%s" TORTURE_FLAG +				 "torture_onoff task: onlined %d\n", +				 torture_type, cpu); +		(*n_onl_successes)++; +		delta = jiffies - starttime; +		*sum_onl += delta; +		if (*min_onl < 0) { +			*min_onl = delta; +			*max_onl = delta; +		} +		if (*min_onl > delta) +			*min_onl = delta; +		if (*max_onl < delta) +			*max_onl = delta; +	} + +	return true; +} +EXPORT_SYMBOL_GPL(torture_online); + +/*   * Execute random CPU-hotplug operations at the interval specified   * by the onoff_interval.   */ @@ -89,16 +187,19 @@ static int  torture_onoff(void *arg)  {  	int cpu; -	unsigned long delta;  	int maxcpu = -1;  	DEFINE_TORTURE_RANDOM(rand); -	int ret; -	unsigned long starttime;  	VERBOSE_TOROUT_STRING("torture_onoff task started");  	for_each_online_cpu(cpu)  		maxcpu = cpu;  	WARN_ON(maxcpu < 0); + +	if (maxcpu == 0) { +		VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); +		goto stop; +	} +  	if (onoff_holdoff > 0) {  		VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");  		schedule_timeout_interruptible(onoff_holdoff); @@ -106,69 +207,16 @@ torture_onoff(void *arg)  	}  	while (!torture_must_stop()) {  		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); -		if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { -			if (verbose) -				pr_alert("%s" TORTURE_FLAG -					 "torture_onoff task: offlining %d\n", -					 torture_type, cpu); -			starttime = jiffies; -			n_offline_attempts++; -			ret = cpu_down(cpu); -			if (ret) { -				if (verbose) -					pr_alert("%s" TORTURE_FLAG -						 "torture_onoff task: offline %d failed: errno %d\n", -						 torture_type, cpu, ret); -			} else { -				if (verbose) -					pr_alert("%s" TORTURE_FLAG -						 "torture_onoff task: offlined %d\n", -						 torture_type, cpu); -				n_offline_successes++; -				delta = jiffies - starttime; -				sum_offline += delta; -				if (min_offline < 0) { -					min_offline = delta; -					max_offline = delta; -				} -				if (min_offline > delta) -					min_offline = delta; -				if (max_offline < delta) -					max_offline = delta; -			} -		} else if (cpu_is_hotpluggable(cpu)) { -			if (verbose) -				pr_alert("%s" TORTURE_FLAG -					 "torture_onoff task: onlining %d\n", -					 torture_type, cpu); -			starttime = jiffies; -			n_online_attempts++; -			ret = cpu_up(cpu); -			if (ret) { -				if (verbose) -					pr_alert("%s" TORTURE_FLAG -						 "torture_onoff task: online %d failed: errno %d\n", -						 torture_type, cpu, ret); -			} else { -				if (verbose) -					pr_alert("%s" TORTURE_FLAG -						 "torture_onoff task: onlined %d\n", -						 torture_type, cpu); -				n_online_successes++; -				delta = jiffies - starttime; -				sum_online += delta; -				if (min_online < 0) { -					min_online = delta; -					max_online = delta; -				} -				if (min_online > delta) -					min_online = delta; -				if (max_online < delta) -					max_online = delta; -			} -		} +		if (!torture_offline(cpu, +				     
&n_offline_attempts, &n_offline_successes, +				     &sum_offline, &min_offline, &max_offline)) +			torture_online(cpu, +				       &n_online_attempts, &n_online_successes, +				       &sum_online, &min_online, &max_online);  		schedule_timeout_interruptible(onoff_interval);  	} + +stop:  	torture_kthread_stopping("torture_onoff");  	return 0;  } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -542,6 +542,7 @@ config HIST_TRIGGERS  	bool "Histogram triggers"  	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG  	select TRACING_MAP +	select TRACING  	default n  	help  	  Hist triggers allow one or more arbitrary trace event fields diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9aef8654e90d..fb345cd11883 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)  static void trace_note_time(struct blk_trace *bt)  { -	struct timespec now; +	struct timespec64 now;  	unsigned long flags;  	u32 words[2]; -	getnstimeofday(&now); -	words[0] = now.tv_sec; +	/* need to check user space to see if this breaks in y2038 or y2106 */ +	ktime_get_real_ts64(&now); +	words[0] = (u32)now.tv_sec;  	words[1] = now.tv_nsec;  	local_irq_save(flags); @@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),  				 BLK_TC_ACT(BLK_TC_WRITE) };  #define BLK_TC_RAHEAD		BLK_TC_AHEAD +#define BLK_TC_PREFLUSH		BLK_TC_FLUSH  /* The ilog2() calls fall out because they're constant */  #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ @@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),   * blk_io_trace structure and places it in a per-cpu subbuffer.   */  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, -		     int rw, u32 what, int error, int pdu_len, void *pdu_data) +		     int op, int op_flags, u32 what, int error, int pdu_len, +		     void *pdu_data)  {  	struct task_struct *tsk = current;  	struct ring_buffer_event *event = NULL; @@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))  		return; -	what |= ddir_act[rw & WRITE]; -	what |= MASK_TC_BIT(rw, SYNC); -	what |= MASK_TC_BIT(rw, RAHEAD); -	what |= MASK_TC_BIT(rw, META); -	what |= MASK_TC_BIT(rw, DISCARD); -	what |= MASK_TC_BIT(rw, FLUSH); -	what |= MASK_TC_BIT(rw, FUA); +	what |= ddir_act[op_is_write(op) ? 
WRITE : READ]; +	what |= MASK_TC_BIT(op_flags, SYNC); +	what |= MASK_TC_BIT(op_flags, RAHEAD); +	what |= MASK_TC_BIT(op_flags, META); +	what |= MASK_TC_BIT(op_flags, PREFLUSH); +	what |= MASK_TC_BIT(op_flags, FUA); +	if (op == REQ_OP_DISCARD) +		what |= BLK_TC_ACT(BLK_TC_DISCARD); +	if (op == REQ_OP_FLUSH) +		what |= BLK_TC_ACT(BLK_TC_FLUSH);  	pid = tsk->pid;  	if (act_log_check(bt, what, sector, pid)) @@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,  	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {  		what |= BLK_TC_ACT(BLK_TC_PC); -		__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, +		__blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,  				what, rq->errors, rq->cmd_len, rq->cmd);  	} else  {  		what |= BLK_TC_ACT(BLK_TC_FS); -		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, +		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),  				rq->cmd_flags, what, rq->errors, 0, NULL);  	}  } @@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,  		return;  	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, -			bio->bi_rw, what, error, 0, NULL); +			bio_op(bio), bio->bi_rw, what, error, 0, NULL);  }  static void blk_add_trace_bio_bounce(void *ignore, @@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,  		struct blk_trace *bt = q->blk_trace;  		if (bt) -			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); +			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, +					NULL);  	}  } @@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,  		struct blk_trace *bt = q->blk_trace;  		if (bt) -			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, +			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,  					0, 0, NULL);  	}  } @@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)  	struct blk_trace *bt = q->blk_trace;  	if (bt) -		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);  }  static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,  		else  			what = BLK_TA_UNPLUG_TIMER; -		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); +		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);  	}  } @@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,  		__be64 rpdu = cpu_to_be64(pdu);  		__blk_add_trace(bt, bio->bi_iter.bi_sector, -				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, -				bio->bi_error, sizeof(rpdu), &rpdu); +				bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, +				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), +				&rpdu);  	}  } @@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,  	r.sector_from = cpu_to_be64(from);  	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, -			bio->bi_rw, BLK_TA_REMAP, bio->bi_error, +			bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,  			sizeof(r), &r);  } @@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,  	r.sector_from = cpu_to_be64(from);  	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), -			rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, +			rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,  			sizeof(r), &r);  } @@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,  		return;  	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) -		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, +		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,  				
BLK_TA_DRV_DATA, rq->errors, len, data);  	else -		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, +		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,  				BLK_TA_DRV_DATA, rq->errors, len, data);  }  EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)  	}  } -void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)  {  	int i = 0; -	if (rw & REQ_FLUSH) +	if (rw & REQ_PREFLUSH)  		rwbs[i++] = 'F'; -	if (rw & WRITE) +	switch (op) { +	case REQ_OP_WRITE: +	case REQ_OP_WRITE_SAME:  		rwbs[i++] = 'W'; -	else if (rw & REQ_DISCARD) +		break; +	case REQ_OP_DISCARD: +		rwbs[i++] = 'D'; +		break; +	case REQ_OP_SECURE_ERASE:  		rwbs[i++] = 'D'; -	else if (bytes) +		rwbs[i++] = 'E'; +		break; +	case REQ_OP_FLUSH: +		rwbs[i++] = 'F'; +		break; +	case REQ_OP_READ:  		rwbs[i++] = 'R'; -	else +		break; +	default:  		rwbs[i++] = 'N'; +	}  	if (rw & REQ_FUA)  		rwbs[i++] = 'F'; @@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  		rwbs[i++] = 'S';  	if (rw & REQ_META)  		rwbs[i++] = 'M'; -	if (rw & REQ_SECURE) -		rwbs[i++] = 'E';  	rwbs[i] = '\0';  } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {  	.arg3_type	= ARG_ANYTHING,  }; +static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +	void *unsafe_ptr = (void *) (long) r1; +	void *src = (void *) (long) r2; +	int size = (int) r3; + +	/* +	 * Ensure we're in user context which is safe for the helper to +	 * run. This helper has no business in a kthread. +	 * +	 * access_ok() should prevent writing to non-user memory, but in +	 * some situations (nommu, temporary switch, etc) access_ok() does +	 * not provide enough validation, hence the check on KERNEL_DS. 
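/*
 * Illustrative sketch (editor's addition, not from this patch): how a
 * tracing program might call the new bpf_probe_write_user() helper. This
 * assumes the samples/bpf style declarations (SEC(), PT_REGS_PARM2(),
 * bpf_probe_write_user(dst, src, size)) from "bpf_helpers.h"; the probe
 * point and the idea of patching the sys_write buffer are illustrative only.
 */
#include <uapi/linux/bpf.h>
#include <linux/ptrace.h>
#include "bpf_helpers.h"		/* assumption: samples/bpf helpers */

SEC("kprobe/sys_write")
int sk_patch_write(struct pt_regs *ctx)
{
	char tag[] = "X";
	void *ubuf = (void *)PT_REGS_PARM2(ctx);	/* user buffer */

	/* Overwrite the first byte of the user buffer; the helper refuses
	 * to run from kthreads, interrupts or under KERNEL_DS. */
	bpf_probe_write_user(ubuf, tag, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";	/* helper is GPL-only */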
+	 */ + +	if (unlikely(in_interrupt() || +		     current->flags & (PF_KTHREAD | PF_EXITING))) +		return -EPERM; +	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) +		return -EPERM; +	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) +		return -EPERM; + +	return probe_kernel_write(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { +	.func		= bpf_probe_write_user, +	.gpl_only	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_ANYTHING, +	.arg2_type	= ARG_PTR_TO_STACK, +	.arg3_type	= ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ +	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", +			    current->comm, task_pid_nr(current)); + +	return &bpf_probe_write_user_proto; +} +  /*   * limited trace_printk()   * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed @@ -188,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)  	return &bpf_trace_printk_proto;  } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)  {  	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;  	struct bpf_array *array = container_of(map, struct bpf_array, map); +	unsigned int cpu = smp_processor_id(); +	u64 index = flags & BPF_F_INDEX_MASK; +	struct bpf_event_entry *ee;  	struct perf_event *event; -	struct file *file; +	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) +		return -EINVAL; +	if (index == BPF_F_CURRENT_CPU) +		index = cpu;  	if (unlikely(index >= array->map.max_entries))  		return -E2BIG; -	file = READ_ONCE(array->ptrs[index]); -	if (unlikely(!file)) +	ee = READ_ONCE(array->ptrs[index]); +	if (!ee)  		return -ENOENT; -	event = file->private_data; +	event = ee->event; +	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && +		     event->attr.type != PERF_TYPE_RAW)) +		return -EINVAL;  	/* make sure event is local and doesn't have pmu::count */ -	if (event->oncpu != smp_processor_id() || -	    event->pmu->count) +	if (unlikely(event->oncpu != cpu || event->pmu->count))  		return -EINVAL;  	/* @@ -225,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {  	.arg2_type	= ARG_ANYTHING,  }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, +			u64 flags, struct perf_raw_record *raw)  { -	struct pt_regs *regs = (struct pt_regs *) (long) r1; -	struct bpf_map *map = (struct bpf_map *) (long) r2;  	struct bpf_array *array = container_of(map, struct bpf_array, map); +	unsigned int cpu = smp_processor_id();  	u64 index = flags & BPF_F_INDEX_MASK; -	void *data = (void *) (long) r4;  	struct perf_sample_data sample_data; +	struct bpf_event_entry *ee;  	struct perf_event *event; -	struct file *file; -	struct perf_raw_record raw = { -		.size = size, -		.data = data, -	}; -	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) -		return -EINVAL;  	if (index == BPF_F_CURRENT_CPU) -		index = raw_smp_processor_id(); +		index = cpu;  	if (unlikely(index >= array->map.max_entries))  		return -E2BIG; -	file = READ_ONCE(array->ptrs[index]); -	if (unlikely(!file)) +	ee = READ_ONCE(array->ptrs[index]); +	if (!ee)  		return -ENOENT; -	event = file->private_data; - +	event = ee->event;  	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||  		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) 
 		return -EINVAL; -	if (unlikely(event->oncpu != smp_processor_id())) +	if (unlikely(event->oncpu != cpu))  		return -EOPNOTSUPP;  	perf_sample_data_init(&sample_data, 0, 0); -	sample_data.raw = &raw; +	sample_data.raw = raw;  	perf_event_output(event, &sample_data, regs);  	return 0;  } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ +	struct pt_regs *regs = (struct pt_regs *)(long) r1; +	struct bpf_map *map  = (struct bpf_map *)(long) r2; +	void *data = (void *)(long) r4; +	struct perf_raw_record raw = { +		.frag = { +			.size = size, +			.data = data, +		}, +	}; + +	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) +		return -EINVAL; + +	return __bpf_perf_event_output(regs, map, flags, &raw); +} +  static const struct bpf_func_proto bpf_perf_event_output_proto = {  	.func		= bpf_perf_event_output,  	.gpl_only	= true, @@ -279,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {  static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, +		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)  {  	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); +	struct perf_raw_frag frag = { +		.copy		= ctx_copy, +		.size		= ctx_size, +		.data		= ctx, +	}; +	struct perf_raw_record raw = { +		.frag = { +			{ +				.next	= ctx_size ? &frag : NULL, +			}, +			.size	= meta_size, +			.data	= meta, +		}, +	};  	perf_fetch_caller_regs(regs); -	return bpf_perf_event_output((long)regs, r2, flags, r4, size); +	return __bpf_perf_event_output(regs, map, flags, &raw);  } -static const struct bpf_func_proto bpf_event_output_proto = { -	.func		= bpf_event_output, +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +	return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { +	.func		= bpf_get_current_task,  	.gpl_only	= true,  	.ret_type	= RET_INTEGER, -	.arg1_type	= ARG_PTR_TO_CTX, -	.arg2_type	= ARG_CONST_MAP_PTR, -	.arg3_type	= ARG_ANYTHING, -	.arg4_type	= ARG_PTR_TO_STACK, -	.arg5_type	= ARG_CONST_STACK_SIZE,  }; -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ -	return &bpf_event_output_proto; -} -  static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)  {  	switch (func_id) { @@ -321,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)  		return &bpf_tail_call_proto;  	case BPF_FUNC_get_current_pid_tgid:  		return &bpf_get_current_pid_tgid_proto; +	case BPF_FUNC_get_current_task: +		return &bpf_get_current_task_proto;  	case BPF_FUNC_get_current_uid_gid:  		return &bpf_get_current_uid_gid_proto;  	case BPF_FUNC_get_current_comm: @@ -331,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)  		return &bpf_get_smp_processor_id_proto;  	case BPF_FUNC_perf_event_read:  		return &bpf_perf_event_read_proto; +	case BPF_FUNC_probe_write_user: +		return bpf_get_probe_write_proto();  	default:  		return NULL;  	} @@ -349,20 +425,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func  }  /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, +					enum bpf_reg_type *reg_type)  { -	/* check bounds */  	if (off < 0 || off >= 
sizeof(struct pt_regs))  		return false; - -	/* only read is allowed */  	if (type != BPF_READ)  		return false; - -	/* disallow misaligned access */  	if (off % size != 0)  		return false; -  	return true;  } @@ -427,7 +498,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)  	}  } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, +				    enum bpf_reg_type *reg_type)  {  	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)  		return false; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;  /* What to set function_trace_op to */  static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { -	struct list_head list; -	struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops)  { -	return !list_empty(&ftrace_pids); +	struct trace_array *tr; + +	if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) +		return false; + +	tr = ops->private; + +	return tr->function_pids != NULL;  }  static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)  static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,  			    struct ftrace_ops *op, struct pt_regs *regs)  { -	if (!test_tsk_trace_trace(current)) +	struct trace_array *tr = op->private; + +	if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))  		return;  	op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	/* Always save the function, and reset at unregistering */  	ops->saved_func = ops->func; -	if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) +	if (ftrace_pids_enabled(ops))  		ops->func = ftrace_pid_func;  	ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  static void ftrace_update_pid_func(void)  { -	bool enabled = ftrace_pids_enabled();  	struct ftrace_ops *op;  	/* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)  	do_for_each_ftrace_op(op, ftrace_ops_list) {  		if (op->flags & FTRACE_OPS_FL_PID) { -			op->func = enabled ? ftrace_pid_func : -				op->saved_func; +			op->func = ftrace_pids_enabled(op) ? 
+				ftrace_pid_func : op->saved_func;  			ftrace_update_trampoline(op);  		}  	} while_for_each_ftrace_op(op); @@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)  	return ops->func;  } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, +		    struct task_struct *prev, struct task_struct *next)  { -	struct task_struct *p; -	int cpu; +	struct trace_array *tr = data; +	struct trace_pid_list *pid_list; -	get_online_cpus(); -	for_each_online_cpu(cpu) { -		p = idle_task(cpu); -		clear_tsk_trace_trace(p); -	} -	put_online_cpus(); -} - -static void set_ftrace_swapper(void) -{ -	struct task_struct *p; -	int cpu; +	pid_list = rcu_dereference_sched(tr->function_pids); -	get_online_cpus(); -	for_each_online_cpu(cpu) { -		p = idle_task(cpu); -		set_tsk_trace_trace(p); -	} -	put_online_cpus(); +	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, +		       trace_ignore_this_task(pid_list, next));  } -static void clear_ftrace_pid(struct pid *pid) +static void clear_ftrace_pids(struct trace_array *tr)  { -	struct task_struct *p; +	struct trace_pid_list *pid_list; +	int cpu; -	rcu_read_lock(); -	do_each_pid_task(pid, PIDTYPE_PID, p) { -		clear_tsk_trace_trace(p); -	} while_each_pid_task(pid, PIDTYPE_PID, p); -	rcu_read_unlock(); +	pid_list = rcu_dereference_protected(tr->function_pids, +					     lockdep_is_held(&ftrace_lock)); +	if (!pid_list) +		return; -	put_pid(pid); -} +	unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); -static void set_ftrace_pid(struct pid *pid) -{ -	struct task_struct *p; +	for_each_possible_cpu(cpu) +		per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; -	rcu_read_lock(); -	do_each_pid_task(pid, PIDTYPE_PID, p) { -		set_tsk_trace_trace(p); -	} while_each_pid_task(pid, PIDTYPE_PID, p); -	rcu_read_unlock(); -} +	rcu_assign_pointer(tr->function_pids, NULL); -static void clear_ftrace_pid_task(struct pid *pid) -{ -	if (pid == ftrace_swapper_pid) -		clear_ftrace_swapper(); -	else -		clear_ftrace_pid(pid); -} +	/* Wait till all users are no longer using pid filtering */ +	synchronize_sched(); -static void set_ftrace_pid_task(struct pid *pid) -{ -	if (pid == ftrace_swapper_pid) -		set_ftrace_swapper(); -	else -		set_ftrace_pid(pid); +	trace_free_pid_list(pid_list);  } -static int ftrace_pid_add(int p) +static void ftrace_pid_reset(struct trace_array *tr)  { -	struct pid *pid; -	struct ftrace_pid *fpid; -	int ret = -EINVAL; -  	mutex_lock(&ftrace_lock); - -	if (!p) -		pid = ftrace_swapper_pid; -	else -		pid = find_get_pid(p); - -	if (!pid) -		goto out; - -	ret = 0; - -	list_for_each_entry(fpid, &ftrace_pids, list) -		if (fpid->pid == pid) -			goto out_put; - -	ret = -ENOMEM; - -	fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); -	if (!fpid) -		goto out_put; - -	list_add(&fpid->list, &ftrace_pids); -	fpid->pid = pid; - -	set_ftrace_pid_task(pid); +	clear_ftrace_pids(tr);  	ftrace_update_pid_func(); -  	ftrace_startup_all(0);  	mutex_unlock(&ftrace_lock); -	return 0; - -out_put: -	if (pid != ftrace_swapper_pid) -		put_pid(pid); - -out: -	mutex_unlock(&ftrace_lock); -	return ret;  } -static void ftrace_pid_reset(void) -{ -	struct ftrace_pid *fpid, *safe; - -	mutex_lock(&ftrace_lock); -	list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { -		struct pid *pid = fpid->pid; - -		clear_ftrace_pid_task(pid); - -		list_del(&fpid->list); -		kfree(fpid); -	} - -	ftrace_update_pid_func(); -	ftrace_startup_all(0); - -	mutex_unlock(&ftrace_lock); -} +/* Greater than 
any max PID */ +#define FTRACE_NO_PIDS		(void *)(PID_MAX_LIMIT + 1)  static void *fpid_start(struct seq_file *m, loff_t *pos) +	__acquires(RCU)  { +	struct trace_pid_list *pid_list; +	struct trace_array *tr = m->private; +  	mutex_lock(&ftrace_lock); +	rcu_read_lock_sched(); -	if (!ftrace_pids_enabled() && (!*pos)) -		return (void *) 1; +	pid_list = rcu_dereference_sched(tr->function_pids); -	return seq_list_start(&ftrace_pids, *pos); +	if (!pid_list) +		return !(*pos) ? FTRACE_NO_PIDS : NULL; + +	return trace_pid_start(pid_list, pos);  }  static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)  { -	if (v == (void *)1) +	struct trace_array *tr = m->private; +	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + +	if (v == FTRACE_NO_PIDS)  		return NULL; -	return seq_list_next(v, &ftrace_pids, pos); +	return trace_pid_next(pid_list, v, pos);  }  static void fpid_stop(struct seq_file *m, void *p) +	__releases(RCU)  { +	rcu_read_unlock_sched();  	mutex_unlock(&ftrace_lock);  }  static int fpid_show(struct seq_file *m, void *v)  { -	const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - -	if (v == (void *)1) { +	if (v == FTRACE_NO_PIDS) {  		seq_puts(m, "no pid\n");  		return 0;  	} -	if (fpid->pid == ftrace_swapper_pid) -		seq_puts(m, "swapper tasks\n"); -	else -		seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - -	return 0; +	return trace_pid_show(m, v);  }  static const struct seq_operations ftrace_pid_sops = { @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {  static int  ftrace_pid_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private; +	struct seq_file *m;  	int ret = 0; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) -		ftrace_pid_reset(); +		ftrace_pid_reset(tr); -	if (file->f_mode & FMODE_READ) -		ret = seq_open(file, &ftrace_pid_sops); +	ret = seq_open(file, &ftrace_pid_sops); +	if (ret < 0) { +		trace_array_put(tr); +	} else { +		m = file->private_data; +		/* copy tr over to seq ops */ +		m->private = tr; +	}  	return ret;  } +static void ignore_task_cpu(void *data) +{ +	struct trace_array *tr = data; +	struct trace_pid_list *pid_list; + +	/* +	 * This function is called by on_each_cpu() while the +	 * event_mutex is held. 
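/*
 * Illustrative sketch (editor's addition, not from this patch): the
 * filtering scheme above in miniature. A per-CPU "ignore" flag is
 * recomputed at every context switch (and once via on_each_cpu() when the
 * filter changes), so the per-event hot path only reads a local boolean.
 * The userspace stand-ins below are assumptions of this sketch.
 */
#include <stdio.h>
#include <stdbool.h>

#define SK_NR_CPUS 4
#define SK_PID_MAX 64

static bool sk_filtered[SK_PID_MAX];	/* stands in for the pid bitmap    */
static bool sk_ignore[SK_NR_CPUS];	/* stands in for ftrace_ignore_pid */

static void sk_sched_switch(int cpu, int next_pid)
{
	/* Trace only pids present in the filter; ignore everything else. */
	sk_ignore[cpu] = !sk_filtered[next_pid];
}

int main(void)
{
	sk_filtered[42] = true;
	sk_sched_switch(0, 42);
	sk_sched_switch(1, 7);
	printf("cpu0 ignore=%d cpu1 ignore=%d\n", sk_ignore[0], sk_ignore[1]);
	return 0;
}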
+	 */ +	pid_list = rcu_dereference_protected(tr->function_pids, +					     mutex_is_locked(&ftrace_lock)); + +	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, +		       trace_ignore_this_task(pid_list, current)); +} +  static ssize_t  ftrace_pid_write(struct file *filp, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  { -	char buf[64], *tmp; -	long val; -	int ret; +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private; +	struct trace_pid_list *filtered_pids = NULL; +	struct trace_pid_list *pid_list; +	ssize_t ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; +	if (!cnt) +		return 0; + +	mutex_lock(&ftrace_lock); + +	filtered_pids = rcu_dereference_protected(tr->function_pids, +					     lockdep_is_held(&ftrace_lock)); + +	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); +	if (ret < 0) +		goto out; -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; +	rcu_assign_pointer(tr->function_pids, pid_list); -	buf[cnt] = 0; +	if (filtered_pids) { +		synchronize_sched(); +		trace_free_pid_list(filtered_pids); +	} else if (pid_list) { +		/* Register a probe to set whether to ignore the tracing of a task */ +		register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); +	}  	/* -	 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" -	 * to clean the filter quietly. +	 * Ignoring of pids is done at task switch. But we have to +	 * check for those tasks that are currently running. +	 * Always do this in case a pid was appended or removed.  	 */ -	tmp = strstrip(buf); -	if (strlen(tmp) == 0) -		return 1; +	on_each_cpu(ignore_task_cpu, tr, 1); -	ret = kstrtol(tmp, 10, &val); -	if (ret < 0) -		return ret; +	ftrace_update_pid_func(); +	ftrace_startup_all(0); + out: +	mutex_unlock(&ftrace_lock); -	ret = ftrace_pid_add(val); +	if (ret > 0) +		*ppos += ret; -	return ret ? 
ret : cnt; +	return ret;  }  static int  ftrace_pid_release(struct inode *inode, struct file *file)  { -	if (file->f_mode & FMODE_READ) -		seq_release(inode, file); +	struct trace_array *tr = inode->i_private; -	return 0; +	trace_array_put(tr); + +	return seq_release(inode, file);  }  static const struct file_operations ftrace_pid_fops = { @@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {  	.release	= ftrace_pid_release,  }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)  { -	struct dentry *d_tracer; +	trace_create_file("set_ftrace_pid", 0644, d_tracer, +			    tr, &ftrace_pid_fops); +} -	d_tracer = tracing_init_dentry(); -	if (IS_ERR(d_tracer)) -		return 0; +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, +					 struct dentry *d_tracer) +{ +	/* Only the top level directory has the dyn_tracefs and profile */ +	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));  	ftrace_init_dyn_tracefs(d_tracer); - -	trace_create_file("set_ftrace_pid", 0644, d_tracer, -			    NULL, &ftrace_pid_fops); -  	ftrace_profile_tracefs(d_tracer); - -	return 0;  } -fs_initcall(ftrace_init_tracefs);  /**   * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@  #include <linux/hardirq.h>  #include <linux/linkage.h>  #include <linux/uaccess.h> -#include <linux/kprobes.h> +#include <linux/vmalloc.h>  #include <linux/ftrace.h>  #include <linux/module.h>  #include <linux/percpu.h> @@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,  	return 0;  } +void trace_free_pid_list(struct trace_pid_list *pid_list) +{ +	vfree(pid_list->pids); +	kfree(pid_list); +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ +	/* +	 * If pid_max changed after filtered_pids was created, we +	 * by default ignore all pids greater than the previous pid_max. +	 */ +	if (search_pid >= filtered_pids->pid_max) +		return false; + +	return test_bit(search_pid, filtered_pids->pids); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ +	/* +	 * Return false, because if filtered_pids does not exist, +	 * all pids are good to trace. +	 */ +	if (!filtered_pids) +		return false; + +	return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. 
If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, +				  struct task_struct *self, +				  struct task_struct *task) +{ +	if (!pid_list) +		return; + +	/* For forks, we only add if the forking task is listed */ +	if (self) { +		if (!trace_find_filtered_pid(pid_list, self->pid)) +			return; +	} + +	/* Sorry, but we don't support pid_max changing after setting */ +	if (task->pid >= pid_list->pid_max) +		return; + +	/* "self" is set for forks, and NULL for exits */ +	if (self) +		set_bit(task->pid, pid_list->pids); +	else +		clear_bit(task->pid, pid_list->pids); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ +	unsigned long pid = (unsigned long)v; + +	(*pos)++; + +	/* pid already is +1 of the actual prevous bit */ +	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + +	/* Return pid + 1 to allow zero to be represented */ +	if (pid < pid_list->pid_max) +		return (void *)(pid + 1); + +	return NULL; +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ +	unsigned long pid; +	loff_t l = 0; + +	pid = find_first_bit(pid_list->pids, pid_list->pid_max); +	if (pid >= pid_list->pid_max) +		return NULL; + +	/* Return pid + 1 so that zero can be the exit value */ +	for (pid++; pid && l < *pos; +	     pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) +		; +	return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ +	unsigned long pid = (unsigned long)v - 1; + +	seq_printf(m, "%lu\n", pid); +	return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE		127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, +		    struct trace_pid_list **new_pid_list, +		    const char __user *ubuf, size_t cnt) +{ +	struct trace_pid_list *pid_list; +	struct trace_parser parser; +	unsigned long val; +	int nr_pids = 0; +	ssize_t read = 0; +	ssize_t ret = 0; +	loff_t pos; +	pid_t pid; + +	if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) +		return -ENOMEM; + +	/* +	 * Always recreate a new array. The write is an all or nothing +	 * operation. Always create a new array when adding new pids by +	 * the user. If the operation fails, then the current list is +	 * not modified. 
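/*
 * Illustrative sketch (editor's addition, not from this patch): the
 * "all or nothing" update described above, in userspace form. Build a fresh
 * bitmap sized for pid_max ((pid_max + 7) >> 3 bytes), copy the old bits in,
 * apply the new pids, and only then publish the new list; on any failure the
 * old list stays untouched. Publishing here is a plain pointer store; the
 * real code uses rcu_assign_pointer(). sk_ names are sketch assumptions.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sk_pid_list {
	int pid_max;
	unsigned char *pids;		/* bitmap, one bit per pid */
};

static struct sk_pid_list *sk_current;

static void sk_set_bit(unsigned char *map, int pid)
{
	map[pid >> 3] |= 1u << (pid & 7);
}

static int sk_add_pids(const int *new_pids, int n, int pid_max)
{
	struct sk_pid_list *nl = malloc(sizeof(*nl));

	if (!nl)
		return -1;
	nl->pid_max = pid_max;
	nl->pids = calloc((pid_max + 7) >> 3, 1);
	if (!nl->pids) {
		free(nl);
		return -1;		/* old list stays untouched */
	}
	if (sk_current)			/* carry over existing bits
					 * (sketch assumes pid_max unchanged) */
		memcpy(nl->pids, sk_current->pids, (pid_max + 7) >> 3);
	for (int i = 0; i < n; i++)
		sk_set_bit(nl->pids, new_pids[i]);

	sk_current = nl;		/* publish the new list */
	return 0;
}

int main(void)
{
	int pids[] = { 1, 42 };

	sk_add_pids(pids, 2, 4096);
	printf("bit 42 set: %d\n",
	       !!(sk_current->pids[42 >> 3] & (1 << (42 & 7))));
	return 0;
}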
+	 */ +	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); +	if (!pid_list) +		return -ENOMEM; + +	pid_list->pid_max = READ_ONCE(pid_max); + +	/* Only truncating will shrink pid_max */ +	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) +		pid_list->pid_max = filtered_pids->pid_max; + +	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); +	if (!pid_list->pids) { +		kfree(pid_list); +		return -ENOMEM; +	} + +	if (filtered_pids) { +		/* copy the current bits to the new max */ +		for_each_set_bit(pid, filtered_pids->pids, +				 filtered_pids->pid_max) { +			set_bit(pid, pid_list->pids); +			nr_pids++; +		} +	} + +	while (cnt > 0) { + +		pos = 0; + +		ret = trace_get_user(&parser, ubuf, cnt, &pos); +		if (ret < 0 || !trace_parser_loaded(&parser)) +			break; + +		read += ret; +		ubuf += ret; +		cnt -= ret; + +		parser.buffer[parser.idx] = 0; + +		ret = -EINVAL; +		if (kstrtoul(parser.buffer, 0, &val)) +			break; +		if (val >= pid_list->pid_max) +			break; + +		pid = (pid_t)val; + +		set_bit(pid, pid_list->pids); +		nr_pids++; + +		trace_parser_clear(&parser); +		ret = 0; +	} +	trace_parser_put(&parser); + +	if (ret < 0) { +		trace_free_pid_list(pid_list); +		return ret; +	} + +	if (!nr_pids) { +		/* Cleared the list of pids */ +		trace_free_pid_list(pid_list); +		read = ret; +		pid_list = NULL; +	} + +	*new_pid_list = pid_list; + +	return read; +} +  static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)  {  	u64 ts; @@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,  {  	__buffer_unlock_commit(buffer, event); -	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); +	/* +	 * If regs is not set, then skip the following callers: +	 *   trace_buffer_unlock_commit_regs +	 *   event_trigger_unlock_commit +	 *   trace_event_buffer_commit +	 *   trace_event_raw_event_sched_switch +	 * Note, we can still get here via blktrace, wakeup tracer +	 * and mmiotrace, but that's ok if they lose a function or +	 * two. They are not that meaningful. +	 */ +	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);  	ftrace_trace_userstack(buffer, flags, pc);  } @@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	trace.skip		= skip;  	/* +	 * Add two, for this function and the call to save_stack_trace() +	 * If regs is set, then these functions will not be in the way. +	 */ +	if (!regs) +		trace.skip += 2; + +	/*  	 * Since events can happen in NMIs there's no safe way to  	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt  	 * or NMI comes in, it will just have to use the default @@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)  /* created for use with alloc_percpu */  struct trace_buffer_struct { -	char buffer[TRACE_BUF_SIZE]; +	int nesting; +	char buffer[4][TRACE_BUF_SIZE];  };  static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer;  /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * This allows for lockless recording.  If we're nested too deeply, then + * this returns NULL.   
*/  static char *get_trace_buf(void)  { -	struct trace_buffer_struct *percpu_buffer; - -	/* -	 * If we have allocated per cpu buffers, then we do not -	 * need to do any locking. -	 */ -	if (in_nmi()) -		percpu_buffer = trace_percpu_nmi_buffer; -	else if (in_irq()) -		percpu_buffer = trace_percpu_irq_buffer; -	else if (in_softirq()) -		percpu_buffer = trace_percpu_sirq_buffer; -	else -		percpu_buffer = trace_percpu_buffer; +	struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); -	if (!percpu_buffer) +	if (!buffer || buffer->nesting >= 4)  		return NULL; -	return this_cpu_ptr(&percpu_buffer->buffer[0]); +	return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ +	this_cpu_dec(trace_percpu_buffer->nesting);  }  static int alloc_percpu_trace_buffer(void)  {  	struct trace_buffer_struct *buffers; -	struct trace_buffer_struct *sirq_buffers; -	struct trace_buffer_struct *irq_buffers; -	struct trace_buffer_struct *nmi_buffers;  	buffers = alloc_percpu(struct trace_buffer_struct); -	if (!buffers) -		goto err_warn; - -	sirq_buffers = alloc_percpu(struct trace_buffer_struct); -	if (!sirq_buffers) -		goto err_sirq; - -	irq_buffers = alloc_percpu(struct trace_buffer_struct); -	if (!irq_buffers) -		goto err_irq; - -	nmi_buffers = alloc_percpu(struct trace_buffer_struct); -	if (!nmi_buffers) -		goto err_nmi; +	if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) +		return -ENOMEM;  	trace_percpu_buffer = buffers; -	trace_percpu_sirq_buffer = sirq_buffers; -	trace_percpu_irq_buffer = irq_buffers; -	trace_percpu_nmi_buffer = nmi_buffers; -  	return 0; - - err_nmi: -	free_percpu(irq_buffers); - err_irq: -	free_percpu(sirq_buffers); - err_sirq: -	free_percpu(buffers); - err_warn: -	WARN(1, "Could not allocate percpu trace_printk buffer"); -	return -ENOMEM;  }  static int buffers_allocated; @@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	tbuffer = get_trace_buf();  	if (!tbuffer) {  		len = 0; -		goto out; +		goto out_nobuffer;  	}  	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	}  out: +	put_trace_buf(); + +out_nobuffer:  	preempt_enable_notrace();  	unpause_graph_tracing(); @@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,  	tbuffer = get_trace_buf();  	if (!tbuffer) {  		len = 0; -		goto out; +		goto out_nobuffer;  	}  	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,  		__buffer_unlock_commit(buffer, event);  		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);  	} - out: + +out: +	put_trace_buf(); + +out_nobuffer:  	preempt_enable_notrace();  	unpause_graph_tracing(); @@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)  	for_each_tracing_cpu(cpu)  		tracing_init_tracefs_percpu(tr, cpu); +	ftrace_init_tracefs(tr, d_tracer);  }  static struct vfsmount *trace_automount(void *ingore) @@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void)  		return 0;  	init_tracer_tracefs(&global_trace, d_tracer); +	ftrace_init_tracefs_toplevel(&global_trace, d_tracer);  	trace_create_file("tracing_thresh", 0644, d_tracer,  			&global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum 
trace_type {  	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \  		     filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print,	\ +			    filter)					\ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter) __packed +  #include "trace_entries.h"  /* @@ -156,6 +162,9 @@ struct trace_array_cpu {  	char			comm[TASK_COMM_LEN];  	bool			ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER +	bool			ftrace_ignore_pid; +#endif  };  struct tracer; @@ -247,6 +256,7 @@ struct trace_array {  	int			ref;  #ifdef CONFIG_FUNCTION_TRACER  	struct ftrace_ops	*ops; +	struct trace_pid_list	__rcu *function_pids;  	/* function tracing enabled */  	int			function_enabled;  #endif @@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);  extern unsigned long tracing_thresh; +/* PID filtering */ + +extern int pid_max; + +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, +			     pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, +			    struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, +				  struct task_struct *self, +				  struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, +		    struct trace_pid_list **new_pid_list, +		    const char __user *ubuf, size_t cnt); +  #ifdef CONFIG_TRACER_MAX_TRACE  void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);  void update_max_tr_single(struct trace_array *tr, @@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;  #ifdef CONFIG_FUNCTION_TRACER  extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr)  { -	if (list_empty(&ftrace_pids)) -		return 1; - -	return test_tsk_trace_trace(task); +	return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);  }  extern int ftrace_is_dead(void);  int ftrace_create_function_files(struct trace_array *tr, @@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);  void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);  void ftrace_reset_array_ops(struct trace_array *tr);  int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, +				  struct dentry *d_tracer);  #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr)  {  	return 1;  } @@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }  static inline __init void  ftrace_init_global_array_ops(struct trace_array *tr) { }  static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }  /* ftace_func_t type is not defined, use macro instead of static inline */  #define ftrace_init_array_ops(tr, func) do { } while (0)  #endif /* CONFIG_FUNCTION_TRACER */ @@ -1600,6 +1631,11 @@ int set_tracer_flag(struct 
trace_array *tr, unsigned int mask, int enabled);  #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\  	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \  		     filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ +	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter) +  #include "trace_entries.h"  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,  );  /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,  	TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,  );  /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,  	TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@  #include <linux/kthread.h>  #include <linux/tracefs.h>  #include <linux/uaccess.h> -#include <linux/vmalloc.h>  #include <linux/module.h>  #include <linux/ctype.h>  #include <linux/sort.h> @@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,  	local_save_flags(fbuffer->flags);  	fbuffer->pc = preempt_count(); +	/* +	 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables +	 * preemption (adding one to the preempt_count). Since we are +	 * interested in the preempt_count at the time the tracepoint was +	 * hit, we need to subtract one to offset the increment. +	 */ +	if (IS_ENABLED(CONFIG_PREEMPT)) +		fbuffer->pc--;  	fbuffer->trace_file = trace_file;  	fbuffer->event = @@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)  	mutex_unlock(&event_mutex);  } -/* Shouldn't this be in a header? */ -extern int pid_max; - -/* Returns true if found in filter */ -static bool -find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ -	/* -	 * If pid_max changed after filtered_pids was created, we -	 * by default ignore all pids greater than the previous pid_max. -	 */ -	if (search_pid >= filtered_pids->pid_max) -		return false; - -	return test_bit(search_pid, filtered_pids->pids); -} - -static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) -{ -	/* -	 * Return false, because if filtered_pids does not exist, -	 * all pids are good to trace. 
-	 */ -	if (!filtered_pids) -		return false; - -	return !find_filtered_pid(filtered_pids, task->pid); -} - -static void filter_add_remove_task(struct trace_pid_list *pid_list, -				   struct task_struct *self, -				   struct task_struct *task) -{ -	if (!pid_list) -		return; - -	/* For forks, we only add if the forking task is listed */ -	if (self) { -		if (!find_filtered_pid(pid_list, self->pid)) -			return; -	} - -	/* Sorry, but we don't support pid_max changing after setting */ -	if (task->pid >= pid_list->pid_max) -		return; - -	/* "self" is set for forks, and NULL for exits */ -	if (self) -		set_bit(task->pid, pid_list->pids); -	else -		clear_bit(task->pid, pid_list->pids); -} -  static void  event_filter_pid_sched_process_exit(void *data, struct task_struct *task)  { @@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)  	struct trace_array *tr = data;  	pid_list = rcu_dereference_sched(tr->filtered_pids); -	filter_add_remove_task(pid_list, NULL, task); +	trace_filter_add_remove_task(pid_list, NULL, task);  }  static void @@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,  	struct trace_array *tr = data;  	pid_list = rcu_dereference_sched(tr->filtered_pids); -	filter_add_remove_task(pid_list, self, task); +	trace_filter_add_remove_task(pid_list, self, task);  }  void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,  	pid_list = rcu_dereference_sched(tr->filtered_pids);  	this_cpu_write(tr->trace_buffer.data->ignore_pid, -		       ignore_this_task(pid_list, prev) && -		       ignore_this_task(pid_list, next)); +		       trace_ignore_this_task(pid_list, prev) && +		       trace_ignore_this_task(pid_list, next));  }  static void @@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,  	pid_list = rcu_dereference_sched(tr->filtered_pids);  	this_cpu_write(tr->trace_buffer.data->ignore_pid, -		       ignore_this_task(pid_list, next)); +		       trace_ignore_this_task(pid_list, next));  }  static void @@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)  	pid_list = rcu_dereference_sched(tr->filtered_pids);  	this_cpu_write(tr->trace_buffer.data->ignore_pid, -		       ignore_this_task(pid_list, task)); +		       trace_ignore_this_task(pid_list, task));  }  static void @@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)  	/* Set tracing if current is enabled */  	this_cpu_write(tr->trace_buffer.data->ignore_pid, -		       ignore_this_task(pid_list, current)); +		       trace_ignore_this_task(pid_list, current));  }  static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)  	/* Wait till all users are no longer using pid filtering */  	synchronize_sched(); -	vfree(pid_list->pids); -	kfree(pid_list); +	trace_free_pid_list(pid_list);  }  static void ftrace_clear_event_pids(struct trace_array *tr) @@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)  {  	struct trace_array *tr = m->private;  	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); -	unsigned long pid = (unsigned long)v; - -	(*pos)++; - -	/* pid already is +1 of the actual prevous bit */ -	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); -	/* Return pid + 1 to allow zero to be represented */ -	if (pid < pid_list->pid_max) 
-		return (void *)(pid + 1); - -	return NULL; +	return trace_pid_next(pid_list, v, pos);  }  static void *p_start(struct seq_file *m, loff_t *pos) @@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)  {  	struct trace_pid_list *pid_list;  	struct trace_array *tr = m->private; -	unsigned long pid; -	loff_t l = 0;  	/*  	 * Grab the mutex, to keep calls to p_next() having the same @@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)  	if (!pid_list)  		return NULL; -	pid = find_first_bit(pid_list->pids, pid_list->pid_max); -	if (pid >= pid_list->pid_max) -		return NULL; - -	/* Return pid + 1 so that zero can be the exit value */ -	for (pid++; pid && l < *pos; -	     pid = (unsigned long)p_next(m, (void *)pid, &l)) -		; -	return (void *)pid; +	return trace_pid_start(pid_list, pos);  }  static void p_stop(struct seq_file *m, void *p) @@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)  	mutex_unlock(&event_mutex);  } -static int p_show(struct seq_file *m, void *v) -{ -	unsigned long pid = (unsigned long)v - 1; - -	seq_printf(m, "%lu\n", pid); -	return 0; -} -  static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos) @@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)  					     mutex_is_locked(&event_mutex));  	this_cpu_write(tr->trace_buffer.data->ignore_pid, -		       ignore_this_task(pid_list, current)); +		       trace_ignore_this_task(pid_list, current));  }  static ssize_t @@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,  	struct trace_pid_list *filtered_pids = NULL;  	struct trace_pid_list *pid_list;  	struct trace_event_file *file; -	struct trace_parser parser; -	unsigned long val; -	loff_t this_pos; -	ssize_t read = 0; -	ssize_t ret = 0; -	pid_t pid; -	int nr_pids = 0; +	ssize_t ret;  	if (!cnt)  		return 0; @@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,  	if (ret < 0)  		return ret; -	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) -		return -ENOMEM; -  	mutex_lock(&event_mutex); +  	filtered_pids = rcu_dereference_protected(tr->filtered_pids,  					     lockdep_is_held(&event_mutex)); -	/* -	 * Always recreate a new array. The write is an all or nothing -	 * operation. Always create a new array when adding new pids by -	 * the user. If the operation fails, then the current list is -	 * not modified. 
-	 */ -	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); -	if (!pid_list) { -		read = -ENOMEM; -		goto out; -	} -	pid_list->pid_max = READ_ONCE(pid_max); -	/* Only truncating will shrink pid_max */ -	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) -		pid_list->pid_max = filtered_pids->pid_max; -	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); -	if (!pid_list->pids) { -		kfree(pid_list); -		read = -ENOMEM; -		goto out; -	} -	if (filtered_pids) { -		/* copy the current bits to the new max */ -		pid = find_first_bit(filtered_pids->pids, -				     filtered_pids->pid_max); -		while (pid < filtered_pids->pid_max) { -			set_bit(pid, pid_list->pids); -			pid = find_next_bit(filtered_pids->pids, -					    filtered_pids->pid_max, -					    pid + 1); -			nr_pids++; -		} -	} - -	while (cnt > 0) { - -		this_pos = 0; - -		ret = trace_get_user(&parser, ubuf, cnt, &this_pos); -		if (ret < 0 || !trace_parser_loaded(&parser)) -			break; - -		read += ret; -		ubuf += ret; -		cnt -= ret; - -		parser.buffer[parser.idx] = 0; - -		ret = -EINVAL; -		if (kstrtoul(parser.buffer, 0, &val)) -			break; -		if (val >= pid_list->pid_max) -			break; - -		pid = (pid_t)val; - -		set_bit(pid, pid_list->pids); -		nr_pids++; - -		trace_parser_clear(&parser); -		ret = 0; -	} -	trace_parser_put(&parser); - -	if (ret < 0) { -		vfree(pid_list->pids); -		kfree(pid_list); -		read = ret; +	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); +	if (ret < 0)  		goto out; -	} -	if (!nr_pids) { -		/* Cleared the list of pids */ -		vfree(pid_list->pids); -		kfree(pid_list); -		read = ret; -		if (!filtered_pids) -			goto out; -		pid_list = NULL; -	}  	rcu_assign_pointer(tr->filtered_pids, pid_list);  	list_for_each_entry(file, &tr->events, list) { @@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,  	if (filtered_pids) {  		synchronize_sched(); - -		vfree(filtered_pids->pids); -		kfree(filtered_pids); -	} else { +		trace_free_pid_list(filtered_pids); +	} else if (pid_list) {  		/*  		 * Register a probe that is called before all other probes  		 * to set ignore_pid if next or prev do not match. 
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,   out:  	mutex_unlock(&event_mutex); -	ret = read; -	if (read > 0) -		*ppos += read; +	if (ret > 0) +		*ppos += ret;  	return ret;  } @@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {  static const struct seq_operations show_set_pid_seq_ops = {  	.start = p_start,  	.next = p_next, -	.show = p_show, +	.show = trace_pid_show,  	.stop = p_stop,  }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)  	/* Currently only the non stack verision is supported */  	ops->func = function_trace_call; -	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; +	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;  	tr->ops = ops;  	ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	int cpu;  	int pc; -	if (!ftrace_trace_task(current)) +	if (!ftrace_trace_task(tr))  		return 0;  	/* trace it when it is-nested-in or is a function enabled. */ @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	if (ftrace_graph_notrace_addr(trace->func))  		return 1; +	/* +	 * Stop here if tracing_threshold is set. We only write function return +	 * events to the ring buffer. +	 */ +	if (tracing_thresh) +		return 1; +  	local_irq_save(flags);  	cpu = raw_smp_processor_id();  	data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	return ret;  } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ -	if (tracing_thresh) -		return 1; -	else -		return trace_graph_entry(trace); -} -  static void  __trace_graph_function(struct trace_array *tr,  		unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)  	set_graph_array(tr);  	if (tracing_thresh)  		ret = register_ftrace_graph(&trace_graph_thresh_return, -					    &trace_graph_thresh_entry); +					    &trace_graph_entry);  	else  		ret = register_ftrace_graph(&trace_graph_return,  					    &trace_graph_entry); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)  	 *  $retval	: fetch return value  	 *  $stack	: fetch stack address  	 *  $stackN	: fetch Nth of stack (N:0-) +	 *  $comm       : fetch current task comm  	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)  	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)  	 *  %REG	: fetch register REG diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)  	trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",  			 dev->bus->number, dev->devfn,  			 dev->vendor, dev->device, dev->irq); -	/* -	 * XXX: is pci_resource_to_user() 
appropriate, since we are -	 * supposed to interpret the __ioremap() phys_addr argument based on -	 * these printed values? -	 */  	for (i = 0; i < 7; i++) { -		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); +		start = dev->resource[i].start;  		trace_seq_printf(s, " %llx",  			(unsigned long long)(start |  			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));  	}  	for (i = 0; i < 7; i++) { -		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); +		start = dev->resource[i].start; +		end = dev->resource[i].end;  		trace_seq_printf(s, " %llx",  			dev->resource[i].start < dev->resource[i].end ?  			(unsigned long long)(end - start) + 1 : 0); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt {  static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)  {  	struct trace_bprintk_fmt *pos; + +	if (!fmt) +		return ERR_PTR(-EINVAL); +  	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {  		if (!strcmp(pos->fmt, fmt))  			return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)  	for (iter = start; iter < end; iter++) {  		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);  		if (tb_fmt) { -			*iter = tb_fmt->fmt; +			if (!IS_ERR(tb_fmt)) +				*iter = tb_fmt->fmt;  			continue;  		} diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)  	kfree(data);  } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, +					  void *data, void *dest) +{ +	int maxlen = get_rloc_len(*(u32 *)dest); +	u8 *dst = get_rloc_data(dest); +	long ret; + +	if (!maxlen) +		return; + +	ret = strlcpy(dst, current->comm, maxlen); +	*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, +					       void *data, void *dest) +{ +	*(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); +  static const struct fetch_type *find_fetch_type(const char *type,  						const struct fetch_type *ftbl)  { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,  			}  		} else  			ret = -EINVAL; +	} else if (strcmp(arg, "comm") == 0) { +		if (strcmp(t->name, "string") != 0 && +		    strcmp(t->name, "string_size") != 0) +			return -EINVAL; +		f->fn = t->fetch[FETCH_MTD_comm];  	} else  		ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,  		arg[t - parg->comm] = '\0';  		t++;  	} +	/* +	 * The default type of $comm should be "string", and it can't be +	 * dereferenced. 
+	 */ +	if (!t && strcmp(arg, "$comm") == 0) +		t = "string";  	parg->type = find_fetch_type(t, ftbl);  	if (!parg->type) {  		pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum {  	FETCH_MTD_reg = 0,  	FETCH_MTD_stack,  	FETCH_MTD_retval, +	FETCH_MTD_comm,  	FETCH_MTD_memory,  	FETCH_MTD_symbol,  	FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);  #define fetch_bitfield_string			NULL  #define fetch_bitfield_string_size		NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8		NULL +#define fetch_comm_u16		NULL +#define fetch_comm_u32		NULL +#define fetch_comm_u64		NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); +  /*   * Define macro for basic types - we don't need to define s* types, because   * we have to care only about bitwidth at recording time. @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)  ASSIGN_FETCH_FUNC(reg, ftype),				\  ASSIGN_FETCH_FUNC(stack, ftype),			\  ASSIGN_FETCH_FUNC(retval, ftype),			\ +ASSIGN_FETCH_FUNC(comm, ftype),				\  ASSIGN_FETCH_FUNC(memory, ftype),			\  ASSIGN_FETCH_FUNC(symbol, ftype),			\  ASSIGN_FETCH_FUNC(deref, ftype),			\ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)  	return allowed;  } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. + */ +bool current_in_userns(const struct user_namespace *target_ns) +{ +	struct user_namespace *ns; +	for (ns = current_user_ns(); ns; ns = ns->parent) { +		if (ns == target_ns) +			return true; +	} +	return false; +} +  static inline struct user_namespace *to_user_ns(struct ns_common *ns)  {  	return container_of(ns, struct user_namespace, ns); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)  /**   * show_workqueue_state - dump workqueue state   * - * Called from a sysrq handler and prints out all busy workqueues and - * pools. + * Called from a sysrq handler or try_to_freeze_tasks() and prints out + * all busy workqueues and pools.   */  void show_workqueue_state(void)  { @@ -4600,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)  	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))  		return; -	/* is @cpu the only online CPU? */  	cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); -	if (cpumask_weight(&cpumask) != 1) -		return;  	/* as we're called from CPU_ONLINE, the following shouldn't fail */  	for_each_pool_worker(worker, pool) -		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, -						  pool->attrs->cpumask) < 0); +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);  } -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. 
- */ -static int workqueue_cpu_up_callback(struct notifier_block *nfb, -					       unsigned long action, -					       void *hcpu) +int workqueue_prepare_cpu(unsigned int cpu) +{ +	struct worker_pool *pool; + +	for_each_cpu_worker_pool(pool, cpu) { +		if (pool->nr_workers) +			continue; +		if (!create_worker(pool)) +			return -ENOMEM; +	} +	return 0; +} + +int workqueue_online_cpu(unsigned int cpu)  { -	int cpu = (unsigned long)hcpu;  	struct worker_pool *pool;  	struct workqueue_struct *wq;  	int pi; -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_UP_PREPARE: -		for_each_cpu_worker_pool(pool, cpu) { -			if (pool->nr_workers) -				continue; -			if (!create_worker(pool)) -				return NOTIFY_BAD; -		} -		break; - -	case CPU_DOWN_FAILED: -	case CPU_ONLINE: -		mutex_lock(&wq_pool_mutex); +	mutex_lock(&wq_pool_mutex); -		for_each_pool(pool, pi) { -			mutex_lock(&pool->attach_mutex); +	for_each_pool(pool, pi) { +		mutex_lock(&pool->attach_mutex); -			if (pool->cpu == cpu) -				rebind_workers(pool); -			else if (pool->cpu < 0) -				restore_unbound_workers_cpumask(pool, cpu); +		if (pool->cpu == cpu) +			rebind_workers(pool); +		else if (pool->cpu < 0) +			restore_unbound_workers_cpumask(pool, cpu); -			mutex_unlock(&pool->attach_mutex); -		} +		mutex_unlock(&pool->attach_mutex); +	} -		/* update NUMA affinity of unbound workqueues */ -		list_for_each_entry(wq, &workqueues, list) -			wq_update_unbound_numa(wq, cpu, true); +	/* update NUMA affinity of unbound workqueues */ +	list_for_each_entry(wq, &workqueues, list) +		wq_update_unbound_numa(wq, cpu, true); -		mutex_unlock(&wq_pool_mutex); -		break; -	} -	return NOTIFY_OK; +	mutex_unlock(&wq_pool_mutex); +	return 0;  } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, -						 unsigned long action, -						 void *hcpu) +int workqueue_offline_cpu(unsigned int cpu)  { -	int cpu = (unsigned long)hcpu;  	struct work_struct unbind_work;  	struct workqueue_struct *wq; -	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_DOWN_PREPARE: -		/* unbinding per-cpu workers should happen on the local CPU */ -		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); -		queue_work_on(cpu, system_highpri_wq, &unbind_work); - -		/* update NUMA affinity of unbound workqueues */ -		mutex_lock(&wq_pool_mutex); -		list_for_each_entry(wq, &workqueues, list) -			wq_update_unbound_numa(wq, cpu, false); -		mutex_unlock(&wq_pool_mutex); - -		/* wait for per-cpu unbinding to finish */ -		flush_work(&unbind_work); -		destroy_work_on_stack(&unbind_work); -		break; -	} -	return NOTIFY_OK; +	/* unbinding per-cpu workers should happen on the local CPU */ +	INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); +	queue_work_on(cpu, system_highpri_wq, &unbind_work); + +	/* update NUMA affinity of unbound workqueues */ +	mutex_lock(&wq_pool_mutex); +	list_for_each_entry(wq, &workqueues, list) +		wq_update_unbound_numa(wq, cpu, false); +	mutex_unlock(&wq_pool_mutex); + +	/* wait for per-cpu unbinding to finish */ +	flush_work(&unbind_work); +	destroy_work_on_stack(&unbind_work); +	return 0;  }  #ifdef CONFIG_SMP @@ -5490,9 +5467,6 @@ static int __init init_workqueues(void)  	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); -	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -  	wq_numa_init();  	/* initialize CPU pools */  |
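The closing workqueue hunks drop the CPU-notifier pair in favour of workqueue_prepare_cpu(), workqueue_online_cpu() and workqueue_offline_cpu(), plain int (unsigned int cpu) callbacks shaped for the CPU hotplug state machine; the hookup of these particular callbacks lives in the core hotplug code, outside this diff. For illustration only (nothing below is part of the patch, and the my_* identifiers and the "sketch:online" name are invented), this is how an ordinary subsystem might register a comparable online/offline pair through the dynamic CPUHP_AP_ONLINE_DYN state:

/* Illustrative sketch only: hotplug callbacks with the same shape as the new workqueue ones. */
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state my_hp_state;

/* Runs once @cpu is online; set up any per-CPU resources here. */
static int my_online_cpu(unsigned int cpu)
{
	return 0;
}

/* Runs while @cpu is going down; tear down or migrate per-CPU resources. */
static int my_offline_cpu(unsigned int cpu)
{
	return 0;
}

static int __init my_hp_init(void)
{
	int ret;

	/* A dynamic state returns its allocated state number on success. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "sketch:online",
				my_online_cpu, my_offline_cpu);
	if (ret < 0)
		return ret;
	my_hp_state = ret;
	return 0;
}

static void __exit my_hp_exit(void)
{
	cpuhp_remove_state(my_hp_state);
}

module_init(my_hp_init);
module_exit(my_hp_exit);
MODULE_LICENSE("GPL");

Part of the point of the conversion, visible in the deleted CPU_DOWN_FAILED handling above, is that the hotplug core now performs rollback itself when a CPU fails to come up or go down, instead of each subsystem reacting to failure notifications.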
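On the tracing side, the pid-filtering helpers consolidated into kernel/trace/trace.c (trace_pid_write(), trace_pid_start(), trace_pid_next(), trace_pid_show()) now back both the per-instance set_event_pid file and the set_ftrace_pid file created by ftrace_init_tracefs(). As a rough user-space illustration only — not part of the patch, assuming tracefs is mounted at /sys/kernel/tracing, with the program itself made up — the sketch below feeds PIDs to the top-level file; since trace_pid_write() copies any existing bits into the freshly allocated bitmap, successive writes add PIDs rather than replacing the list.

/* Hypothetical helper: write each PID given on the command line into set_ftrace_pid. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/kernel/tracing/set_ftrace_pid"; /* assumed mount point */
	char buf[32];
	int fd, i, len;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return EXIT_FAILURE;
	}

	for (i = 1; i < argc; i++) {
		/* one PID per write(); the kernel parses it via trace_pid_write() */
		len = snprintf(buf, sizeof(buf), "%s ", argv[i]);
		if (write(fd, buf, len) != len) {
			perror("write");
			close(fd);
			return EXIT_FAILURE;
		}
	}

	close(fd);
	return EXIT_SUCCESS;
}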