Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c              | 47
-rw-r--r--  kernel/compat.c              | 24
-rw-r--r--  kernel/irq/chip.c            |  1
-rw-r--r--  kernel/kcmp.c                |  7
-rw-r--r--  kernel/kexec.c               | 11
-rw-r--r--  kernel/power/power.h         |  1
-rw-r--r--  kernel/power/suspend.c       |  2
-rw-r--r--  kernel/power/suspend_test.c  | 31
-rw-r--r--  kernel/printk/printk.c       |  6
-rw-r--r--  kernel/rcu/tree.h            |  2
-rw-r--r--  kernel/rcu/tree_plugin.h     | 22
-rw-r--r--  kernel/resource.c            | 11
-rw-r--r--  kernel/time/tick-sched.c     | 14
-rw-r--r--  kernel/time/timekeeping.c    |  5
-rw-r--r--  kernel/trace/ring_buffer.c   | 16
15 files changed, 142 insertions, 58 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788cfd52..940aced4ed00 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp)
 	css_get(&cgrp->self);
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
 static void cgroup_put(struct cgroup *cgrp)
 {
 	css_put(&cgrp->self);
@@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
 	 * protection against removal.  Ensure @cgrp stays accessible and
 	 * break the active_ref protection.
 	 */
-	cgroup_get(cgrp);
+	if (!cgroup_tryget(cgrp))
+		return NULL;
 	kernfs_break_active_protection(kn);
 
 	mutex_lock(&cgroup_mutex);
@@ -3271,8 +3277,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 
-	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-		cft->flags |= __CFTYPE_NOT_ON_DFL;
+	/*
+	 * If legacy_files_on_dfl, we want to show the legacy files on the
+	 * dfl hierarchy, but only if the target subsystem hasn't been
+	 * updated for the dfl hierarchy yet.
+	 */
+	if (!cgroup_legacy_files_on_dfl ||
+	    ss->dfl_cftypes != ss->legacy_cftypes) {
+		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+			cft->flags |= __CFTYPE_NOT_ON_DFL;
+	}
+
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -4387,6 +4402,15 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 		cgrp->id = -1;
+
+		/*
+		 * There are two control paths which try to determine
+		 * cgroup from dentry without going through kernfs -
+		 * cgroupstats_build() and css_tryget_online_from_dir().
+		 * Those are supported by RCU protecting clearing of
+		 * cgrp->kn->priv backpointer.
+		 */
+		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4543,6 +4567,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	struct cftype *base_files;
 	int ssid, ret;
 
+	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+	 */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
 	parent = cgroup_kn_lock_live(parent_kn);
 	if (!parent)
 		return -ENODEV;
@@ -4820,16 +4849,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
 	cgroup_kn_unlock(kn);
 
-	/*
-	 * There are two control paths which try to determine cgroup from
-	 * dentry without going through kernfs - cgroupstats_build() and
-	 * css_tryget_online_from_dir().  Those are supported by RCU
-	 * protecting clearing of cgrp->kn->priv backpointer, which should
-	 * happen after all files under it have been removed.
-	 */
-	if (!ret)
-		RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
 	cgroup_put(cgrp);
 	return ret;
 }
@@ -5416,7 +5435,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 	/*
 	 * This path doesn't originate from kernfs and @kn could already
 	 * have been or be removed at any point.  @kn->priv is RCU
-	 * protected for this access.  See cgroup_rmdir() for details.
+	 * protected for this access.  See css_release_work_fn() for details.
 	 */
 	cgrp = rcu_dereference(kn->priv);
 	if (cgrp)
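The cgroup_kn_lock_live() fix above is an instance of the classic "tryget" refcount pattern: once an object's count can hit zero concurrently, a caller that cannot prove the object is alive must use a conditional increment that fails on a dead object, rather than an unconditional one that would resurrect it. A minimal user-space sketch of the idea, using C11 atomics instead of the kernel's percpu-ref based css_tryget(); obj, obj_get, and obj_tryget are illustrative names, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;		/* 0 means dead, awaiting free */
};

/* Unconditional get: only legal while the caller already holds a ref. */
static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

/*
 * Conditional get: increment only while the count is still non-zero, so
 * a racing final put cannot be "resurrected".  Returns false when the
 * object is already on its way to destruction, mirroring how
 * cgroup_kn_lock_live() now returns NULL for a dying cgroup.
 */
static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old != 0)
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;	/* on failure, old is reloaded */
	return false;
}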
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f442f8..ebb3c369d03d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 	ret = hrtimer_nanosleep_restart(restart);
 	set_fs(oldfs);
 
-	if (ret) {
+	if (ret == -ERESTART_RESTARTBLOCK) {
 		rmtp = restart->nanosleep.compat_rmtp;
 
 		if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 	set_fs(oldfs);
 
-	if (ret) {
+	/*
+	 * hrtimer_nanosleep() can only return 0 or
+	 * -ERESTART_RESTARTBLOCK here because:
+	 *
+	 * - we call it with HRTIMER_MODE_REL and therefore exclude the
+	 *   -ERESTARTNOHAND return path.
+	 *
+	 * - we supply the rmtp argument from the task stack (due to
+	 *   the necessary compat conversion), so the update cannot
+	 *   fail, which excludes the -EFAULT return path as well.  If
+	 *   it fails nevertheless we have a bigger problem and won't
+	 *   reach this place anymore.
+	 *
+	 * - if the return value is 0, we do not have to update rmtp
+	 *   because there is no remaining time.
+	 *
+	 * We check for -ERESTART_RESTARTBLOCK nevertheless in case the
+	 * core implementation decides to return random nonsense.
+	 */
+	if (ret == -ERESTART_RESTARTBLOCK) {
 		struct restart_block *restart
 			= &current_thread_info()->restart_block;
 
@@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 		if (rmtp && compat_put_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
-
 	return ret;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2fd7b1..6223fab9a9d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -517,6 +517,7 @@ out:
 		chip->irq_eoi(&desc->irq_data);
 	raw_spin_unlock(&desc->lock);
 }
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
 
 /**
  *	handle_edge_irq - edge type IRQ handler
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
  */
 static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
 {
-	long ret;
+	long t1, t2;
 
-	ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+	t1 = kptr_obfuscate((long)v1, type);
+	t2 = kptr_obfuscate((long)v2, type);
 
-	return (ret < 0) | ((ret > 0) << 1);
+	return (t1 < t2) | ((t1 > t2) << 1);
 }
 
 /* The caller must have pinned the task */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0b49a0a58102..2bee072268d9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -64,7 +64,9 @@ bool kexec_in_progress = false;
 char __weak kexec_purgatory[0];
 size_t __weak kexec_purgatory_size = 0;
 
+#ifdef CONFIG_KEXEC_FILE
 static int kexec_calculate_store_digests(struct kimage *image);
+#endif
 
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
@@ -341,6 +343,7 @@ out_free_image:
 	return ret;
 }
 
+#ifdef CONFIG_KEXEC_FILE
 static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
 {
 	struct fd f = fdget(fd);
@@ -612,6 +615,9 @@ out_free_image:
 	kfree(image);
 	return ret;
 }
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
 
 static int kimage_is_destination_range(struct kimage *image,
 					unsigned long start,
@@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 }
 #endif
 
+#ifdef CONFIG_KEXEC_FILE
 SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long,
 		cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags)
@@ -1451,6 +1458,8 @@ out:
 	return ret;
 }
 
+#endif /* CONFIG_KEXEC_FILE */
+
 void crash_kexec(struct pt_regs *regs)
 {
 	/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 
 subsys_initcall(crash_save_vmcoreinfo_init);
 
+#ifdef CONFIG_KEXEC_FILE
 static int __kexec_add_segment(struct kimage *image, char *buf,
 			       unsigned long bufsz, unsigned long mem,
 			       unsigned long memsz)
@@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 
 	return 0;
 }
+#endif /* CONFIG_KEXEC_FILE */
 
 /*
  * Move into place and start executing a preloaded standalone
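The kcmp.c hunk above fixes a classic comparison bug: computing t1 - t2 and testing the sign is wrong whenever the difference overflows a signed long (and signed overflow is undefined behaviour in C anyway), so the fix performs two direct comparisons and folds them into kcmp's 0/1/2 result encoding. A small self-contained demonstration of that encoding; kcmp_encode is an illustrative stand-in for the kernel's kcmp_ptr():

#include <assert.h>
#include <limits.h>

/*
 * Encode an ordering the way the fixed kcmp_ptr() does:
 * 0 = equal, 1 = less than, 2 = greater than.
 */
static int kcmp_encode(long t1, long t2)
{
	return (t1 < t2) | ((t1 > t2) << 1);
}

int main(void)
{
	/*
	 * The removed "t1 - t2" variant misorders values this far
	 * apart: the subtraction overflows, so its sign says nothing
	 * about the real ordering.  Direct comparison cannot fail.
	 */
	assert(kcmp_encode(LONG_MIN, LONG_MAX) == 1);	/* less */
	assert(kcmp_encode(LONG_MAX, LONG_MIN) == 2);	/* greater */
	assert(kcmp_encode(42, 42) == 0);		/* equal */
	return 0;
}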
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dcac2537..2df883a9d3cb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
 
 #ifdef CONFIG_SUSPEND
 /* kernel/power/suspend.c */
+extern const char *pm_labels[];
 extern const char *pm_states[];
 
 extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25cb0d8..18c62195660f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
 
 #include "power.h"
 
-static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
 const char *pm_states[PM_SUSPEND_MAX];
 
 static const struct platform_suspend_ops *suspend_ops;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f524928b6aa..bd91bc177c93 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data)
  * at startup time.  They're normally disabled, for faster boot and because
  * we can't know which states really work on this particular system.
  */
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
 
 static char warn_bad_state[] __initdata =
 	KERN_WARNING "PM: can't test '%s' suspend state\n";
 
 static int __init setup_test_suspend(char *value)
 {
-	suspend_state_t i;
+	int i;
 
 	/* "=mem" ==> "mem" */
 	value++;
-	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
-		if (!strcmp(pm_states[i], value)) {
-			test_state = i;
+	for (i = 0; pm_labels[i]; i++)
+		if (!strcmp(pm_labels[i], value)) {
+			test_state_label = pm_labels[i];
 			return 0;
 		}
 
@@ -158,13 +158,21 @@ static int __init test_suspend(void)
 
 	struct rtc_device *rtc = NULL;
 	struct device *dev;
+	suspend_state_t test_state;
 
 	/* PM is initialized by now; is that state testable? */
-	if (test_state == PM_SUSPEND_ON)
-		goto done;
-	if (!pm_states[test_state]) {
-		printk(warn_bad_state, pm_states[test_state]);
-		goto done;
+	if (!test_state_label)
+		return 0;
+
+	for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+		const char *state_label = pm_states[test_state];
+
+		if (state_label && !strcmp(test_state_label, state_label))
+			break;
+	}
+	if (test_state == PM_SUSPEND_MAX) {
+		printk(warn_bad_state, test_state_label);
+		return 0;
 	}
 
 	/* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +181,12 @@ static int __init test_suspend(void)
 	rtc = rtc_class_open(dev_name(dev));
 	if (!rtc) {
 		printk(warn_no_rtc);
-		goto done;
+		return 0;
 	}
 
 	/* go for it */
 	test_wakealarm(rtc, test_state);
 	rtc_class_close(rtc);
-done:
 	return 0;
 }
 late_initcall(test_suspend);
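The suspend_test.c rework above keys off the trailing NULL that the suspend.c hunk adds to pm_labels[]: a NULL-terminated string table can be scanned without a separate length constant, and resolving the label to a suspend state is deferred until pm_states[] has actually been populated. A rough user-space sketch of that two-phase lookup shape; the enum, tables, and function names are illustrative stand-ins for suspend_state_t, pm_labels[], and pm_states[]:

#include <stdio.h>
#include <string.h>

enum state { STATE_MIN, STATE_MEM = STATE_MIN, STATE_STANDBY, STATE_FREEZE, STATE_MAX };

static const char *labels[] = { "mem", "standby", "freeze", NULL };
static const char *states[STATE_MAX] = { "mem", NULL, "freeze" }; /* "standby" unsupported here */

/* Boot-time half: just remember the label, as setup_test_suspend() now does. */
static const char *find_label(const char *value)
{
	int i;

	for (i = 0; labels[i]; i++)		/* the NULL sentinel ends the scan */
		if (!strcmp(labels[i], value))
			return labels[i];
	return NULL;
}

/* Late half: resolve the label to a state once states[] is populated. */
static enum state resolve(const char *label)
{
	enum state s;

	if (!label)
		return STATE_MAX;
	for (s = STATE_MIN; s < STATE_MAX; s++)
		if (states[s] && !strcmp(states[s], label))
			return s;
	return STATE_MAX;			/* known label, unsupported state */
}

int main(void)
{
	printf("mem     -> %d\n", resolve(find_label("mem")));		/* 0 */
	printf("standby -> %d\n", resolve(find_label("standby")));	/* 3 = STATE_MAX */
	return 0;
}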
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e04c455a0e38..1ce770687ea8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1665,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 	raw_spin_lock(&logbuf_lock);
 	logbuf_cpu = this_cpu;
 
-	if (recursion_bug) {
+	if (unlikely(recursion_bug)) {
 		static const char recursion_msg[] =
 			"BUG: recent printk recursion!";
 
 		recursion_bug = 0;
-		text_len = strlen(recursion_msg);
 		/* emit KERN_CRIT message */
 		printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
-					 NULL, 0, recursion_msg, text_len);
+					 NULL, 0, recursion_msg,
+					 strlen(recursion_msg));
 	}
 
 	/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c718f75..6a86eb7bac45 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -358,7 +358,7 @@ struct rcu_data {
 	struct rcu_head **nocb_gp_tail;
 	long nocb_gp_count;
 	long nocb_gp_count_lazy;
-	bool nocb_leader_wake;		/* Is the nocb leader thread awake? */
+	bool nocb_leader_sleep;		/* Is the nocb leader thread asleep? */
 	struct rcu_data *nocb_next_follower;
 					/* Next follower in wakeup chain. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411e9676..a7997e272564 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 
 	if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
 		return;
-	if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+	if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior xchg orders against prior callback enqueue. */
-		ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+		ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
 		wake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -2253,7 +2253,7 @@ wait_again:
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
 		wait_event_interruptible(my_rdp->nocb_wq,
-				ACCESS_ONCE(my_rdp->nocb_leader_wake));
+				!ACCESS_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2292,12 +2292,12 @@ wait_again:
 		schedule_timeout_interruptible(1);
 
 		/* Rescan in case we were a victim of memory ordering. */
-		my_rdp->nocb_leader_wake = false;
-		smp_mb();  /* Ensure _wake false before scan. */
+		my_rdp->nocb_leader_sleep = true;
+		smp_mb();  /* Ensure _sleep true before scan. */
 		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
 			if (ACCESS_ONCE(rdp->nocb_head)) {
 				/* Found CB, so short-circuit next wait. */
-				my_rdp->nocb_leader_wake = true;
+				my_rdp->nocb_leader_sleep = false;
 				break;
 			}
 		goto wait_again;
@@ -2307,17 +2307,17 @@ wait_again:
 	rcu_nocb_wait_gp(my_rdp);
 
 	/*
-	 * We left ->nocb_leader_wake set to reduce cache thrashing.
-	 * We clear it now, but recheck for new callbacks while
+	 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+	 * We set it now, but recheck for new callbacks while
 	 * traversing our follower list.
 	 */
-	my_rdp->nocb_leader_wake = false;
-	smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+	my_rdp->nocb_leader_sleep = true;
+	smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
 
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 		if (ACCESS_ONCE(rdp->nocb_head))
-			my_rdp->nocb_leader_wake = true; /* No need to wait. */
+			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
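The tree_plugin.h changes above invert the flag's sense (nocb_leader_wake becomes nocb_leader_sleep) but keep the underlying lost-wakeup protocol intact: the leader publishes its intention to sleep, executes a full barrier, then rescans for callbacks, so a racing enqueue either observes the flag and issues a wakeup, or the rescan observes the callback. A compressed user-space sketch of that protocol, with C11 seq_cst atomics standing in for the kernel's ACCESS_ONCE()/smp_mb() and illustrative names throughout:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool leader_sleep;	/* ~ rdp_leader->nocb_leader_sleep */
static atomic_bool cbs_pending;		/* ~ rdp->nocb_head != NULL */

/* Enqueue side (~ wake_nocb_leader()): publish work, then wake if needed. */
static void enqueue_cb(void)
{
	atomic_store(&cbs_pending, true);
	if (atomic_load(&leader_sleep)) {
		atomic_store(&leader_sleep, false);
		/* ... wake_up(&rdp_leader->nocb_wq) in the kernel ... */
	}
}

/*
 * Leader side: announce the intention to sleep *before* the rescan.
 * With both sides ordered this way, at least one of them must see the
 * other's store: either enqueue_cb() sees leader_sleep and wakes us,
 * or we see cbs_pending and skip the wait.
 */
static bool leader_may_sleep(void)
{
	atomic_store(&leader_sleep, true);	/* "_sleep true before scan" */
	if (atomic_load(&cbs_pending)) {
		atomic_store(&leader_sleep, false); /* short-circuit next wait */
		return false;			/* work found, don't sleep */
	}
	return true;	/* safe to wait_event() on the queue */
}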
diff --git a/kernel/resource.c b/kernel/resource.c
index da14b8d09296..60c5a3856ab7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name,
 	end = res->end;
 	BUG_ON(start >= end);
 
-	read_lock(&resource_lock);
-
-	if (first_level_children_only) {
-		p = iomem_resource.child;
+	if (first_level_children_only)
 		sibling_only = true;
-	} else
-		p = &iomem_resource;
 
-	while ((p = next_resource(p, sibling_only))) {
+	read_lock(&resource_lock);
+
+	for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
 		if (p->flags != res->flags)
 			continue;
 		if (name && strcmp(p->name, name))
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee3908f..f654a8a298fa 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 };
 
 /*
+ * Kick this CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
+ */
+void tick_nohz_full_kick(void)
+{
+	if (!tick_nohz_full_cpu(smp_processor_id()))
+		return;
+
+	irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+/*
  * Kick the CPU if it's full dynticks in order to force it to
  * re-evaluate its dependency on the tick and restart it if necessary.
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb4a9c2cf8d9..ec1791fae965 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -442,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		tk->ntp_error = 0;
 		ntp_clear();
 	}
-	update_vsyscall(tk);
-	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
 	tk_update_ktime_data(tk);
 
+	update_vsyscall(tk);
+	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index afb04b9b818a..b38fb2b9e237 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 		work = &cpu_buffer->irq_work;
 	}
 
-	work->waiters_pending = true;
 	poll_wait(filp, &work->waiters, poll_table);
+	work->waiters_pending = true;
+	/*
+	 * There's a tight race between setting the waiters_pending and
+	 * checking if the ring buffer is empty.  Once the waiters_pending bit
+	 * is set, the next event will wake the task up, but we can get stuck
+	 * if there's only a single event in.
+	 *
+	 * FIXME: Ideally, we need a memory barrier on the writer side as well,
+	 * but adding a memory barrier to all events will cause too much of a
+	 * performance hit in the fast path.  We only need a memory barrier when
+	 * the buffer goes from empty to having content.  But as this race is
+	 * extremely small, and it's not a problem if another event comes in, we
+	 * will fix it later.
+	 */
+	smp_mb();
 
 	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
 	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
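The ring_buffer.c hunk above is one half of a store-buffering pattern: the poller stores waiters_pending, executes smp_mb(), then loads the buffer state, so if the poller sees the buffer empty, a writer with the matching barrier would be guaranteed to see waiters_pending and wake it. As the FIXME notes, that writer-side barrier is deliberately omitted for performance. A litmus-style sketch of the two sides, with illustrative names and C11 fences standing in for smp_mb():

#include <stdatomic.h>

/*
 * Without the poller-side fence, its store could be reordered after its
 * load, letting both sides read 0: the poller sleeps on an already
 * non-empty buffer and the writer, seeing no waiters, never wakes it.
 * With both fences present, at least one side must see the other's store.
 */
static atomic_int waiters_pending;	/* stored by the poller */
static atomic_int buffer_nonempty;	/* stored by the writer */

static int poller_sees_data(void)
{
	atomic_store_explicit(&waiters_pending, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the new smp_mb() */
	/* reading 0 here then implies the writer observes waiters_pending */
	return atomic_load_explicit(&buffer_nonempty, memory_order_relaxed);
}

static int writer_sees_waiters(void)
{
	atomic_store_explicit(&buffer_nonempty, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the barrier the
							   patch's FIXME defers */
	return atomic_load_explicit(&waiters_pending, memory_order_relaxed);
}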