From 47b7aee14fd7e453370a5d15dfb11c958ca360f2 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 26 Sep 2018 16:12:06 +0100
Subject: sched/fair: Clean up load_balance() condition

The alignment of the condition is off, clean that up.

Also, logical operators have lower precedence than bitwise/relational
operators, so remove one layer of parentheses to make the condition a
bit simpler to follow.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar.Eggemann@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: patrick.bellasi@arm.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1537974727-30788-1-git-send-email-valentin.schneider@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..4e298931a715 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8877,9 +8877,9 @@ out_all_pinned:
 
 out_one_pinned:
 	/* tune up the balancing interval */
-	if (((env.flags & LBF_ALL_PINNED) &&
-			sd->balance_interval < MAX_PINNED_INTERVAL) ||
-			(sd->balance_interval < sd->max_interval))
+	if ((env.flags & LBF_ALL_PINNED &&
+	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
+	    sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;
 
 	ld_moved = 0;
-- 
cgit 


From 3f130a37c442d5c4d66531b240ebe9abfef426b5 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Wed, 26 Sep 2018 16:12:07 +0100
Subject: sched/fair: Don't increase sd->balance_interval on newidle balance

When load_balance() fails to move some load because of task affinity,
we end up increasing sd->balance_interval to delay the next periodic
balance in the hopes that next time we look, that annoying pinned
task(s) will be gone.

However, idle_balance() pays no attention to sd->balance_interval, yet
it will still lead to an increase in balance_interval in case of
pinned tasks.

If we're going through several newidle balances (e.g. we have a
periodic task), this can lead to a huge increase of the
balance_interval in a very small amount of time.

To prevent that, don't increase the balance interval when going
through a newidle balance.

This is a similar approach to what is done in commit 58b26c4c0257
("sched: Increment cache_nice_tries only on periodic lb"), where we
disregard newidle balance and rely on periodic balance for more stable
results.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar.Eggemann@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: patrick.bellasi@arm.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1537974727-30788-2-git-send-email-valentin.schneider@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e298931a715..a17ca4254427 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8876,13 +8876,22 @@ out_all_pinned:
 	sd->nr_balance_failed = 0;
 
 out_one_pinned:
+	ld_moved = 0;
+
+	/*
+	 * idle_balance() disregards balance intervals, so we could repeatedly
+	 * reach this code, which would lead to balance_interval skyrocketting
+	 * in a short amount of time. Skip the balance_interval increase logic
+	 * to avoid that.
+	 */
+	if (env.idle == CPU_NEWLY_IDLE)
+		goto out;
+
 	/* tune up the balancing interval */
 	if ((env.flags & LBF_ALL_PINNED &&
 	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
 	    sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;
-
-	ld_moved = 0;
 out:
 	return ld_moved;
 }
-- 
cgit 


From ff1cdc94de4d336be45336d70709dfcf3d682514 Mon Sep 17 00:00:00 2001
From: Muchun Song <smuchun@gmail.com>
Date: Fri, 26 Oct 2018 21:17:43 +0800
Subject: sched/core: Introduce set_next_task() helper for better code
 readability

When we pick the next task, we will do the following for the task:

  1) p->se.exec_start = rq_clock_task(rq);
  2) dequeue_pushable(_dl)_task(rq, p);

When we call set_curr_task(), we also need to do the same thing
above. In rt.c, the code at 1) is in the _pick_next_task_rt()
and the code at 2) is in the pick_next_task_rt(). If we put two
operations in one function, maybe better. So, we introduce a new
function set_next_task(), which is responsible for doing the above.

By introducing the function we can get rid of calling the
dequeue_pushable(_dl)_task() directly(We can call set_next_task())
in pick_next_task() and have better code readability and reuse.
In set_curr_task_rt(), we also can call set_next_task().

Do this things such that we end up with:

  static struct task_struct *pick_next_task(struct rq *rq,
  					    struct task_struct *prev,
  					    struct rq_flags *rf)
  {
  	/* do something else ... */

  	put_prev_task(rq, prev);

  	/* pick next task p */

  	set_next_task(rq, p);

  	/* do something else ... */
  }

put_prev_task() can match set_next_task(), which can make the
code more readable.

Signed-off-by: Muchun Song <smuchun@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20181026131743.21786-1-smuchun@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 19 ++++++++++---------
 kernel/sched/rt.c       | 24 +++++++++++-------------
 2 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 91e4202b0634..470ba6b464fe 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1695,6 +1695,14 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
 }
 #endif
 
+static inline void set_next_task(struct rq *rq, struct task_struct *p)
+{
+	p->se.exec_start = rq_clock_task(rq);
+
+	/* You can't push away the running task */
+	dequeue_pushable_dl_task(rq, p);
+}
+
 static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 						   struct dl_rq *dl_rq)
 {
@@ -1750,10 +1758,8 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	BUG_ON(!dl_se);
 
 	p = dl_task_of(dl_se);
-	p->se.exec_start = rq_clock_task(rq);
 
-	/* Running task will never be pushed. */
-       dequeue_pushable_dl_task(rq, p);
+	set_next_task(rq, p);
 
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
@@ -1808,12 +1814,7 @@ static void task_fork_dl(struct task_struct *p)
 
 static void set_curr_task_dl(struct rq *rq)
 {
-	struct task_struct *p = rq->curr;
-
-	p->se.exec_start = rq_clock_task(rq);
-
-	/* You can't push away the running task */
-	dequeue_pushable_dl_task(rq, p);
+	set_next_task(rq, rq->curr);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a21ea6021929..9aa3287ce301 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
 #endif
 }
 
+static inline void set_next_task(struct rq *rq, struct task_struct *p)
+{
+	p->se.exec_start = rq_clock_task(rq);
+
+	/* The running task is never eligible for pushing */
+	dequeue_pushable_task(rq, p);
+}
+
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 						   struct rt_rq *rt_rq)
 {
@@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
-	struct task_struct *p;
 	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
@@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
 
-	p = rt_task_of(rt_se);
-	p->se.exec_start = rq_clock_task(rq);
-
-	return p;
+	return rt_task_of(rt_se);
 }
 
 static struct task_struct *
@@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	p = _pick_next_task_rt(rq);
 
-	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
+	set_next_task(rq, p);
 
 	rt_queue_push_tasks(rq);
 
@@ -2355,12 +2358,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 
 static void set_curr_task_rt(struct rq *rq)
 {
-	struct task_struct *p = rq->curr;
-
-	p->se.exec_start = rq_clock_task(rq);
-
-	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
+	set_next_task(rq, rq->curr);
 }
 
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
-- 
cgit 


From b82592199032bf7c778f861b936287e37ebc9f62 Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Fri, 2 Nov 2018 18:02:48 +0000
Subject: genirq/affinity: Spread IRQs to all available NUMA nodes

If the number of NUMA nodes exceeds the number of MSI/MSI-X interrupts
which are allocated for a device, the interrupt affinity spreading code
fails to spread them across all nodes.

The reason is, that the spreading code starts from node 0 and continues up
to the number of interrupts requested for allocation. This leaves the nodes
past the last interrupt unused.

This results in interrupt concentration on the first nodes which violates
the assumption of the block layer that all nodes are covered evenly. As a
consequence the NUMA nodes above the number of interrupts are all assigned
to hardware queue 0 and therefore NUMA node 0, which results in bad
performance and has CPU hotplug implications, because queue 0 gets shut
down when the last CPU of node 0 is offlined.

Go over all NUMA nodes and assign them round-robin to all requested
interrupts to solve this.

[ tglx: Massaged changelog ]

Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Link: https://lkml.kernel.org/r/20181102180248.13583-1-longli@linuxonhyperv.com
---
 kernel/irq/affinity.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..e12cdf637c71 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -117,12 +117,11 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_copy(masks + curvec, node_to_cpumask[n]);
-			if (++done == numvecs)
-				break;
+			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = affd->pre_vectors;
 		}
+		done = numvecs;
 		goto out;
 	}
 
-- 
cgit 


From 5c903e108d0b005cf59904ca3520934fca4b9439 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 2 Nov 2018 22:59:49 +0800
Subject: genirq/affinity: Move two stage affinity spreading into a helper
 function

No functional change. Prepares for supporting allocating and affinitizing
interrupt sets.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Hannes Reinecke <hare@suse.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Link: https://lkml.kernel.org/r/20181102145951.31979-3-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 92 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 56 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12cdf637c71..2f9812b6035e 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,7 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 	return nodes;
 }
 
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
+static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs,
 				    cpumask_var_t *node_to_cpumask,
 				    const struct cpumask *cpu_mask,
@@ -165,6 +165,58 @@ out:
 	return done;
 }
 
+/*
+ * build affinity in two stages:
+ *	1) spread present CPU on these vectors
+ *	2) spread other possible CPUs on these vectors
+ */
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+				    int startvec, int numvecs,
+				    cpumask_var_t *node_to_cpumask,
+				    struct cpumask *masks)
+{
+	int curvec = startvec, usedvecs = -1;
+	cpumask_var_t nmsk, npresmsk;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+			return usedvecs;
+
+	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+			goto fail;
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	build_node_to_cpumask(node_to_cpumask);
+
+	/* Spread on present CPUs starting from affd->pre_vectors */
+	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
+					    node_to_cpumask, cpu_present_mask,
+					    nmsk, masks);
+
+	/*
+	 * Spread on non present CPUs starting from the next vector to be
+	 * handled. If the spreading of present CPUs already exhausted the
+	 * vector space, assign the non present CPUs to the already spread
+	 * out vectors.
+	 */
+	if (usedvecs >= numvecs)
+		curvec = affd->pre_vectors;
+	else
+		curvec = affd->pre_vectors + usedvecs;
+	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
+					     node_to_cpumask, npresmsk,
+					     nmsk, masks);
+	put_online_cpus();
+
+	free_cpumask_var(npresmsk);
+
+ fail:
+	free_cpumask_var(nmsk);
+
+	return usedvecs;
+}
+
 /**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs:	The total number of vectors
@@ -177,7 +229,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
 	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	int curvec, usedvecs;
-	cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
+	cpumask_var_t *node_to_cpumask;
 	struct cpumask *masks = NULL;
 
 	/*
@@ -187,15 +239,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	if (nvecs == affd->pre_vectors + affd->post_vectors)
 		return NULL;
 
-	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-		return NULL;
-
-	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
-		goto outcpumsk;
-
 	node_to_cpumask = alloc_node_to_cpumask();
 	if (!node_to_cpumask)
-		goto outnpresmsk;
+		return NULL;
 
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
@@ -205,30 +251,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
 
-	/* Stabilize the cpumasks */
-	get_online_cpus();
-	build_node_to_cpumask(node_to_cpumask);
-
-	/* Spread on present CPUs starting from affd->pre_vectors */
 	usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
-					    node_to_cpumask, cpu_present_mask,
-					    nmsk, masks);
-
-	/*
-	 * Spread on non present CPUs starting from the next vector to be
-	 * handled. If the spreading of present CPUs already exhausted the
-	 * vector space, assign the non present CPUs to the already spread
-	 * out vectors.
-	 */
-	if (usedvecs >= affvecs)
-		curvec = affd->pre_vectors;
-	else
-		curvec = affd->pre_vectors + usedvecs;
-	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
-					     node_to_cpumask, npresmsk,
-					     nmsk, masks);
-	put_online_cpus();
+					    node_to_cpumask, masks);
 
 	/* Fill out vectors at the end that don't need affinity */
 	if (usedvecs >= affvecs)
@@ -240,10 +264,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
-outnpresmsk:
-	free_cpumask_var(npresmsk);
-outcpumsk:
-	free_cpumask_var(nmsk);
 	return masks;
 }
 
-- 
cgit 


From 060746d9e394084b7401e7532f2de528ecbfb521 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 2 Nov 2018 22:59:50 +0800
Subject: genirq/affinity: Pass first vector to __irq_build_affinity_masks()

No functional change.

Prepares for support of allocating and affinitizing sets of interrupts, in
which each set of interrupts needs a full two stage spreading. The first
vector argument is necessary for this so the affinitizing starts from the
first vector of each set.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Hannes Reinecke <hare@suse.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Link: https://lkml.kernel.org/r/20181102145951.31979-4-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 2f9812b6035e..e028b773e38a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -95,14 +95,14 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 }
 
 static int __irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs,
+				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
 				    const struct cpumask *cpu_mask,
 				    struct cpumask *nmsk,
 				    struct cpumask *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
-	int last_affv = affd->pre_vectors + numvecs;
+	int last_affv = firstvec + numvecs;
 	int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 
@@ -119,7 +119,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		for_each_node_mask(n, nodemsk) {
 			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
 			if (++curvec == last_affv)
-				curvec = affd->pre_vectors;
+				curvec = firstvec;
 		}
 		done = numvecs;
 		goto out;
@@ -129,7 +129,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		int ncpus, v, vecs_to_assign, vecs_per_node;
 
 		/* Spread the vectors per node */
-		vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
+		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
 		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
@@ -157,7 +157,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		if (done >= numvecs)
 			break;
 		if (curvec >= last_affv)
-			curvec = affd->pre_vectors;
+			curvec = firstvec;
 		--nodes;
 	}
 
@@ -190,8 +190,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
 	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
-					    node_to_cpumask, cpu_present_mask,
-					    nmsk, masks);
+					      affd->pre_vectors,
+					      node_to_cpumask,
+					      cpu_present_mask, nmsk, masks);
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -205,8 +206,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 		curvec = affd->pre_vectors + usedvecs;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
 	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
-					     node_to_cpumask, npresmsk,
-					     nmsk, masks);
+					       affd->pre_vectors,
+					       node_to_cpumask, npresmsk,
+					       nmsk, masks);
 	put_online_cpus();
 
 	free_cpumask_var(npresmsk);
-- 
cgit 


From 6da4b3ab9a6e9b1b5f90322ab3fa3a7dd18edb19 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 2 Nov 2018 22:59:51 +0800
Subject: genirq/affinity: Add support for allocating interrupt sets

A driver may have a need to allocate multiple sets of MSI/MSI-X interrupts,
and have them appropriately affinitized.

Add support for defining a number of sets in the irq_affinity structure, of
varying sizes, and get each set affinitized correctly across the machine.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-block@vger.kernel.org
Link: https://lkml.kernel.org/r/20181102145951.31979-5-ming.lei@redhat.com
---
 drivers/pci/msi.c         | 14 +++++++++
 include/linux/interrupt.h |  4 +++
 kernel/irq/affinity.c     | 77 +++++++++++++++++++++++++++++++++--------------
 3 files changed, 72 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index af24ed50a245..265ed3e4c920 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1036,6 +1036,13 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (maxvec < minvec)
 		return -ERANGE;
 
+	/*
+	 * If the caller is passing in sets, we can't support a range of
+	 * vectors. The caller needs to handle that.
+	 */
+	if (affd && affd->nr_sets && minvec != maxvec)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(dev->msi_enabled))
 		return -EINVAL;
 
@@ -1087,6 +1094,13 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 	if (maxvec < minvec)
 		return -ERANGE;
 
+	/*
+	 * If the caller is passing in sets, we can't support a range of
+	 * supported vectors. The caller needs to handle that.
+	 */
+	if (affd && affd->nr_sets && minvec != maxvec)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(dev->msix_enabled))
 		return -EINVAL;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 1d6711c28271..ca397ff40836 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -247,10 +247,14 @@ struct irq_affinity_notify {
  *			the MSI(-X) vector space
  * @post_vectors:	Don't apply affinity to @post_vectors at end of
  *			the MSI(-X) vector space
+ * @nr_sets:		Length of passed in *sets array
+ * @sets:		Number of affinitized sets
  */
 struct irq_affinity {
 	int	pre_vectors;
 	int	post_vectors;
+	int	nr_sets;
+	int	*sets;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e028b773e38a..08c904eb7279 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -171,28 +171,29 @@ out:
  *	2) spread other possible CPUs on these vectors
  */
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs,
+				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
 				    struct cpumask *masks)
 {
-	int curvec = startvec, usedvecs = -1;
+	int curvec = startvec, nr_present, nr_others;
+	int ret = -ENOMEM;
 	cpumask_var_t nmsk, npresmsk;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-			return usedvecs;
+			return ret;
 
 	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
 			goto fail;
 
+	ret = 0;
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	build_node_to_cpumask(node_to_cpumask);
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
-	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
-					      affd->pre_vectors,
-					      node_to_cpumask,
-					      cpu_present_mask, nmsk, masks);
+	nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+						firstvec, node_to_cpumask,
+						cpu_present_mask, nmsk, masks);
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -200,23 +201,24 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	 * vector space, assign the non present CPUs to the already spread
 	 * out vectors.
 	 */
-	if (usedvecs >= numvecs)
-		curvec = affd->pre_vectors;
+	if (nr_present >= numvecs)
+		curvec = firstvec;
 	else
-		curvec = affd->pre_vectors + usedvecs;
+		curvec = firstvec + nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
-					       affd->pre_vectors,
-					       node_to_cpumask, npresmsk,
-					       nmsk, masks);
+	nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+					       firstvec, node_to_cpumask,
+					       npresmsk, nmsk, masks);
 	put_online_cpus();
 
+	if (nr_present < numvecs)
+			WARN_ON(nr_present + nr_others < numvecs);
+
 	free_cpumask_var(npresmsk);
 
  fail:
 	free_cpumask_var(nmsk);
-
-	return usedvecs;
+	return ret;
 }
 
 /**
@@ -233,6 +235,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	int curvec, usedvecs;
 	cpumask_var_t *node_to_cpumask;
 	struct cpumask *masks = NULL;
+	int i, nr_sets;
 
 	/*
 	 * If there aren't any vectors left after applying the pre/post
@@ -253,8 +256,28 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
 
-	usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
-					    node_to_cpumask, masks);
+	/*
+	 * Spread on present CPUs starting from affd->pre_vectors. If we
+	 * have multiple sets, build each sets affinity mask separately.
+	 */
+	nr_sets = affd->nr_sets;
+	if (!nr_sets)
+		nr_sets = 1;
+
+	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
+		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		int ret;
+
+		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+						curvec, node_to_cpumask, masks);
+		if (ret) {
+				kfree(masks);
+				masks = NULL;
+				goto outnodemsk;
+		}
+		curvec += this_vecs;
+		usedvecs += this_vecs;
+	}
 
 	/* Fill out vectors at the end that don't need affinity */
 	if (usedvecs >= affvecs)
@@ -279,13 +302,21 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
 {
 	int resv = affd->pre_vectors + affd->post_vectors;
 	int vecs = maxvec - resv;
-	int ret;
+	int set_vecs;
 
 	if (resv > minvec)
 		return 0;
 
-	get_online_cpus();
-	ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
-	put_online_cpus();
-	return ret;
+	if (affd->nr_sets) {
+		int i;
+
+		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
+			set_vecs += affd->sets[i];
+	} else {
+		get_online_cpus();
+		set_vecs = cpumask_weight(cpu_possible_mask);
+		put_online_cpus();
+	}
+
+	return resv + min(set_vecs, vecs);
 }
-- 
cgit 


From 7d9df98be66fec64349f9f1c9d3e896293fe7b45 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 3 Nov 2018 22:31:04 -0400
Subject: clockevents: Remove unnecessary unlikely()

WARN_ON() and WARN_ON_ONCE() already contains an unlikely(), so it's not
necessary to use unlikely.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20181104023104.2572-1-tiny.windzz@gmail.com
---
 kernel/time/clockevents.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 8c0e4092f661..af58898d9ebf 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -39,10 +39,8 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
 	u64 clc = (u64) latch << evt->shift;
 	u64 rnd;
 
-	if (unlikely(!evt->mult)) {
+	if (WARN_ON(!evt->mult))
 		evt->mult = 1;
-		WARN_ON(1);
-	}
 	rnd = (u64) evt->mult - 1;
 
 	/*
@@ -164,10 +162,8 @@ void clockevents_switch_state(struct clock_event_device *dev,
 		 * on it, so fix it up and emit a warning:
 		 */
 		if (clockevent_state_oneshot(dev)) {
-			if (unlikely(!dev->mult)) {
+			if (WARN_ON(!dev->mult))
 				dev->mult = 1;
-				WARN_ON(1);
-			}
 		}
 	}
 }
@@ -315,10 +311,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 	int64_t delta;
 	int rc;
 
-	if (unlikely(expires < 0)) {
-		WARN_ON_ONCE(1);
+	if (WARN_ON_ONCE(expires < 0))
 		return -ETIME;
-	}
 
 	dev->next_event = expires;
 
-- 
cgit 


From 4d9ebbe2b061a9c25e12ba8539ba172533132eb6 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 3 Nov 2018 22:27:41 -0400
Subject: cgroup: remove unnecessary unlikely()

WARN_ON() already contains an unlikely(), so it's not necessary to use
unlikely.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..2e5d90dfcb49 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5978,10 +5978,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 
 		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
 
-		if (unlikely(ret >= size)) {
-			WARN_ON(1);
+		if (WARN_ON(ret >= size))
 			break;
-		}
 	}
 
 	return ret;
-- 
cgit 


From ea956d8be91edc702a98b7fe1f9463e7ca8c42ab Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 10 Oct 2018 16:22:57 -0400
Subject: audit: print empty EXECVE args

Empty executable arguments were being skipped when printing out the list
of arguments in an EXECVE record, making it appear they were somehow
lost.  Include empty arguments as an itemized empty string.

Reproducer:
	autrace /bin/ls "" "/etc"
	ausearch --start recent -m execve -i | grep EXECVE
	type=EXECVE msg=audit(10/03/2018 13:04:03.208:1391) : argc=3 a0=/bin/ls a2=/etc

With fix:
	type=EXECVE msg=audit(10/03/2018 21:51:38.290:194) : argc=3 a0=/bin/ls a1= a2=/etc
	type=EXECVE msg=audit(1538617898.290:194): argc=3 a0="/bin/ls" a1="" a2="/etc"

Passes audit-testsuite.  GH issue tracker at
https://github.com/linux-audit/audit-kernel/issues/99

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: cleaned up the commit metadata]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2d1f043f17f..1513873e23bd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1107,7 +1107,7 @@ static void audit_log_execve_info(struct audit_context *context,
 		}
 
 		/* write as much as we can to the audit log */
-		if (len_buf > 0) {
+		if (len_buf >= 0) {
 			/* NOTE: some magic numbers here - basically if we
 			 *       can't fit a reasonable amount of data into the
 			 *       existing audit buffer, flush it and start with
-- 
cgit 


From e8da8794a7fd9eef1ec9a07f0d4897c68581c72b Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Tue, 6 Nov 2018 04:00:00 +0000
Subject: genirq/matrix: Improve target CPU selection for managed interrupts.

On large systems with multiple devices of the same class (e.g. NVMe disks,
using managed interrupts), the kernel can affinitize these interrupts to a
small subset of CPUs instead of spreading them out evenly.

irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask
of possible target CPUs which has the lowest number of interrupt vectors
allocated.

This is done by searching the CPU with the highest number of available
vectors. While this is correct for non-managed CPUs it can select the wrong
CPU for managed interrupts. Under certain constellations this results in
affinitizing the managed interrupts of several devices to a single CPU in
a set.

The book keeping of available vectors works the following way:

 1) Non-managed interrupts:

    available is decremented when the interrupt is actually requested by
    the device driver and a vector is assigned. It's incremented when the
    interrupt and the vector are freed.

 2) Managed interrupts:

    Managed interrupts guarantee vector reservation when the MSI/MSI-X
    functionality of a device is enabled, which is achieved by reserving
    vectors in the bitmaps of the possible target CPUs. This reservation
    decrements the available count on each possible target CPU.

    When the interrupt is requested by the device driver then a vector is
    allocated from the reserved region. The operation is reversed when the
    interrupt is freed by the device driver. Neither of these operations
    affect the available count.

    The reservation persist up to the point where the MSI/MSI-X
    functionality is disabled and only this operation increments the
    available count again.

For non-managed interrupts the available count is the correct selection
criterion because the guaranteed reservations need to be taken into
account. Using the allocated counter could lead to a failing allocation in
the following situation (total vector space of 10 assumed):

		 CPU0	CPU1
 available:	    2	   0
 allocated:	    5	   3   <--- CPU1 is selected, but available space = 0
 managed reserved:  3	   7

 while available yields the correct result.

For managed interrupts the available count is not the appropriate
selection criterion because as explained above the available count is not
affected by the actual vector allocation.

The following example illustrates that. Total vector space of 10
assumed. The starting point is:

		 CPU0	CPU1
 available:	    5	   4
 allocated:	    2	   3
 managed reserved:  3	   3

 Allocating vectors for three non-managed interrupts will result in
 affinitizing the first two to CPU0 and the third one to CPU1 because the
 available count is adjusted with each allocation:

		  CPU0	CPU1
 available:	     5	   4	<- Select CPU0 for 1st allocation
 --> allocated:	     3	   3

 available:	     4	   4	<- Select CPU0 for 2nd allocation
 --> allocated:	     4	   3

 available:	     3	   4	<- Select CPU1 for 3rd allocation
 --> allocated:	     4	   4

 But the allocation of three managed interrupts starting from the same
 point will affinitize all of them to CPU0 because the available count is
 not affected by the allocation (see above). So the end result is:

		  CPU0	CPU1
 available:	     5	   4
 allocated:	     5	   3

Introduce a "managed_allocated" field in struct cpumap to track the vector
allocation for managed interrupts separately. Use this information to
select the target CPU when a vector is allocated for a managed interrupt,
which results in more evenly distributed vector assignments. The above
example results in the following allocations:

		 CPU0	CPU1
 managed_allocated: 0	   0	<- Select CPU0 for 1st allocation
 --> allocated:	    3	   3

 managed_allocated: 1	   0	<- Select CPU1 for 2nd allocation
 --> allocated:	    3	   4

 managed_allocated: 1	   1	<- Select CPU0 for 3rd allocation
 --> allocated:	    4	   4

The allocation of non-managed interrupts is not affected by this change and
is still evaluating the available count.

The overall distribution of interrupt vectors for both types of interrupts
might still not be perfectly even depending on the number of non-managed
and managed interrupts in a system, but due to the reservation guarantee
for managed interrupts this cannot be avoided.

Expose the new field in debugfs as well.

[ tglx: Clarified the background of the problem in the changelog and
  	described it independent of NVME ]

Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Kelley <mikelley@microsoft.com>
Link: https://lkml.kernel.org/r/20181106040000.27316-1-longli@linuxonhyperv.com
---
 kernel/irq/matrix.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 1f0985adf193..30cc217b8631 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -14,6 +14,7 @@ struct cpumap {
 	unsigned int		available;
 	unsigned int		allocated;
 	unsigned int		managed;
+	unsigned int		managed_allocated;
 	bool			initialized;
 	bool			online;
 	unsigned long		alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
 	return best_cpu;
 }
 
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+						const struct cpumask *msk)
+{
+	unsigned int cpu, best_cpu, allocated = UINT_MAX;
+	struct cpumap *cm;
+
+	best_cpu = UINT_MAX;
+
+	for_each_cpu(cpu, msk) {
+		cm = per_cpu_ptr(m->maps, cpu);
+
+		if (!cm->online || cm->managed_allocated > allocated)
+			continue;
+
+		best_cpu = cpu;
+		allocated = cm->managed_allocated;
+	}
+	return best_cpu;
+}
+
 /**
  * irq_matrix_assign_system - Assign system wide entry in the matrix
  * @m:		Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
 	if (cpumask_empty(msk))
 		return -EINVAL;
 
-	cpu = matrix_find_best_cpu(m, msk);
+	cpu = matrix_find_best_cpu_managed(m, msk);
 	if (cpu == UINT_MAX)
 		return -ENOSPC;
 
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
 		return -ENOSPC;
 	set_bit(bit, cm->alloc_map);
 	cm->allocated++;
+	cm->managed_allocated++;
 	m->total_allocated++;
 	*mapped_cpu = cpu;
 	trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
 
 	clear_bit(bit, cm->alloc_map);
 	cm->allocated--;
+	if(managed)
+		cm->managed_allocated--;
 
 	if (cm->online)
 		m->total_allocated--;
@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
 	seq_printf(sf, "Total allocated:  %6u\n", m->total_allocated);
 	seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
 		   m->system_map);
-	seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+	seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
 	cpus_read_lock();
 	for_each_online_cpu(cpu) {
 		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
 
-		seq_printf(sf, "%*s %4d  %4u  %4u  %4u  %*pbl\n", ind, " ",
-			   cpu, cm->available, cm->managed, cm->allocated,
+		seq_printf(sf, "%*s %4d  %4u  %4u  %4u %4u  %*pbl\n", ind, " ",
+			   cpu, cm->available, cm->managed,
+			   cm->managed_allocated, cm->allocated,
 			   m->matrix_bits, cm->alloc_map);
 	}
 	cpus_read_unlock();
-- 
cgit 


From e84cd7ee630e44a2cc8ae49e85920a271b214cb3 Mon Sep 17 00:00:00 2001
From: Ke Wu <mikewu@google.com>
Date: Tue, 6 Nov 2018 15:21:30 -0800
Subject: modsign: use all trusted keys to verify module signature

Make mod_verify_sig to use all trusted keys. This allows keys in
secondary_trusted_keys to be used to verify PKCS#7 signature on a
kernel module.

Signed-off-by: Ke Wu <mikewu@google.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module_signing.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2075ce8e4b3..6b9a926fd86b 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -83,6 +83,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, VERIFYING_MODULE_SIGNATURE,
+				      VERIFY_USE_SECONDARY_KEYRING,
+				      VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-- 
cgit 


From 4ec22e9c5a90e3809dd52014d5d239af8831a520 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:35 -0500
Subject: cpuset: Enable cpuset controller in default hierarchy

Given the fact that thread mode had been merged into 4.14, it is now
time to enable cpuset to be used in the default hierarchy (cgroup v2)
as it is clearly threaded.

The cpuset controller had experienced feature creep since its
introduction more than a decade ago. Besides the core cpus and mems
control files to limit cpus and memory nodes, there are a bunch of
additional features that can be controlled from the userspace. Some of
the features are of doubtful usefulness and may not be actively used.

This patch enables cpuset controller in the default hierarchy with
a minimal set of features, namely just the cpus and mems and their
effective_* counterparts.  We can certainly add more features to the
default hierarchy in the future if there is a real user need for them
later on.

Alternatively, with the unified hiearachy, it may make more sense
to move some of those additional cpuset features, if desired, to
memory controller or may be to the cpu controller instead of staying
with cpuset.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 109 ++++++++++++++++++++++++++++++--
 kernel/cgroup/cpuset.c                  |  48 +++++++++++++-
 2 files changed, 149 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 476722b7b636..01b70f69304e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
          5-3-3-2. IO Latency Interface Files
      5-4. PID
        5-4-1. PID Interface Files
-     5-5. Device
-     5-6. RDMA
-       5-6-1. RDMA Interface Files
-     5-7. Misc
-       5-7-1. perf_event
+     5-5. Cpuset
+       5.5-1. Cpuset Interface Files
+     5-6. Device
+     5-7. RDMA
+       5-7-1. RDMA Interface Files
+     5-8. Misc
+       5-8-1. perf_event
      5-N. Non-normative information
        5-N-1. CPU controller root cgroup process behaviour
        5-N-2. IO controller root cgroup process behaviour
@@ -1610,6 +1612,103 @@ through fork() or clone(). These will return -EAGAIN if the creation
 of a new process would cause a cgroup policy to be violated.
 
 
+Cpuset
+------
+
+The "cpuset" controller provides a mechanism for constraining
+the CPU and memory node placement of tasks to only the resources
+specified in the cpuset interface files in a task's current cgroup.
+This is especially valuable on large NUMA systems where placing jobs
+on properly sized subsets of the systems with careful processor and
+memory placement to reduce cross-node memory access and contention
+can improve overall system performance.
+
+The "cpuset" controller is hierarchical.  That means the controller
+cannot use CPUs or memory nodes not allowed in its parent.
+
+
+Cpuset Interface Files
+~~~~~~~~~~~~~~~~~~~~~~
+
+  cpuset.cpus
+	A read-write multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the requested CPUs to be used by tasks within this
+	cgroup.  The actual list of CPUs to be granted, however, is
+	subjected to constraints imposed by its parent and can differ
+	from the requested CPUs.
+
+	The CPU numbers are comma-separated numbers or ranges.
+	For example:
+
+	  # cat cpuset.cpus
+	  0-4,6,8-10
+
+	An empty value indicates that the cgroup is using the same
+	setting as the nearest cgroup ancestor with a non-empty
+	"cpuset.cpus" or all the available CPUs if none is found.
+
+	The value of "cpuset.cpus" stays constant until the next update
+	and won't be affected by any CPU hotplug events.
+
+  cpuset.cpus.effective
+	A read-only multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the onlined CPUs that are actually granted to this
+	cgroup by its parent.  These CPUs are allowed to be used by
+	tasks within the current cgroup.
+
+	If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
+	all the CPUs from the parent cgroup that can be available to
+	be used by this cgroup.  Otherwise, it should be a subset of
+	"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
+	can be granted.  In this case, it will be treated just like an
+	empty "cpuset.cpus".
+
+	Its value will be affected by CPU hotplug events.
+
+  cpuset.mems
+	A read-write multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the requested memory nodes to be used by tasks within
+	this cgroup.  The actual list of memory nodes granted, however,
+	is subjected to constraints imposed by its parent and can differ
+	from the requested memory nodes.
+
+	The memory node numbers are comma-separated numbers or ranges.
+	For example:
+
+	  # cat cpuset.mems
+	  0-1,3
+
+	An empty value indicates that the cgroup is using the same
+	setting as the nearest cgroup ancestor with a non-empty
+	"cpuset.mems" or all the available memory nodes if none
+	is found.
+
+	The value of "cpuset.mems" stays constant until the next update
+	and won't be affected by any memory nodes hotplug events.
+
+  cpuset.mems.effective
+	A read-only multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the onlined memory nodes that are actually granted to
+	this cgroup by its parent. These memory nodes are allowed to
+	be used by tasks within the current cgroup.
+
+	If "cpuset.mems" is empty, it shows all the memory nodes from the
+	parent cgroup that will be available to be used by this cgroup.
+	Otherwise, it should be a subset of "cpuset.mems" unless none of
+	the memory nodes listed in "cpuset.mems" can be granted.  In this
+	case, it will be treated just like an empty "cpuset.mems".
+
+	Its value will be affected by memory nodes hotplug events.
+
+
 Device controller
 -----------------
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 266f10cb7222..2b5c4477d969 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1824,12 +1824,11 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	return 0;
 }
 
-
 /*
  * for the common functions, 'private' gives the type of file
  */
 
-static struct cftype files[] = {
+static struct cftype legacy_files[] = {
 	{
 		.name = "cpus",
 		.seq_show = cpuset_common_seq_show,
@@ -1931,6 +1930,47 @@ static struct cftype files[] = {
 	{ }	/* terminate */
 };
 
+/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+	{
+		.name = "cpus",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_CPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{
+		.name = "mems",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
+		.private = FILE_MEMLIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{
+		.name = "cpus.effective",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_EFFECTIVE_CPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{
+		.name = "mems.effective",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_EFFECTIVE_MEMLIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{ }	/* terminate */
+};
+
+
 /*
  *	cpuset_css_alloc - allocate a cpuset css
  *	cgrp:	control group that the new cpuset will be part of
@@ -2105,8 +2145,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.fork		= cpuset_fork,
-	.legacy_cftypes	= files,
+	.legacy_cftypes	= legacy_files,
+	.dfl_cftypes	= dfl_files,
 	.early_init	= true,
+	.threaded	= true,
 };
 
 /**
-- 
cgit 


From 58b7484250db8da0d49bdd289c877ccd3319575c Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:36 -0500
Subject: cpuset: Define data structures to support scheduling partition

>From a cpuset point of view, a scheduling partition is a group of
cpusets with their own set of exclusive CPUs that are not shared by
other tasks outside the scheduling partition.

In the legacy hierarchy, scheduling partitions are supported indirectly
via the right use of the load balancing and the exclusive CPUs flag
which is not intuitive and can be hard to use.

To fully support the concept of scheduling partitions in the default
hierarchy, we need to add some new field into the cpuset structure as
well as a new tmpmasks structure that is used to pre-allocate cpumasks
at the top level cpuset functions to avoid memory allocation in inner
functions as memory allocation failure in those inner functions may
cause a cpuset to have inconsistent states.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2b5c4477d969..29a2bdc671fd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -109,6 +109,13 @@ struct cpuset {
 	cpumask_var_t effective_cpus;
 	nodemask_t effective_mems;
 
+	/*
+	 * CPUs allocated to child sub-partitions (default hierarchy only)
+	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
+	 * - effective_cpus and subparts_cpus are mutually exclusive.
+	 */
+	cpumask_var_t subparts_cpus;
+
 	/*
 	 * This is old Memory Nodes tasks took on.
 	 *
@@ -134,6 +141,30 @@ struct cpuset {
 
 	/* for custom sched domain */
 	int relax_domain_level;
+
+	/* number of CPUs in subparts_cpus */
+	int nr_subparts_cpus;
+
+	/* partition root state */
+	int partition_root_state;
+};
+
+/*
+ * Partition root states:
+ *
+ *   0 - not a partition root
+ *   1 - partition root
+ */
+#define PRS_DISABLED		0
+#define PRS_ENABLED		1
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+	cpumask_var_t addmask, delmask;	/* For partition root */
+	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
 };
 
 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -218,9 +249,15 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
+static inline int is_partition_root(const struct cpuset *cs)
+{
+	return cs->partition_root_state;
+}
+
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 		  (1 << CS_MEM_EXCLUSIVE)),
+	.partition_root_state = PRS_ENABLED,
 };
 
 /**
-- 
cgit 


From bf92370c035d5ed73a90927450c20a07adf08cfd Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:37 -0500
Subject: cpuset: Simply allocation and freeing of cpumasks

The previous commit introduces a new subparts_cpus mask into the cpuset
data structure and a new tmpmasks structure.  Managing the allocation
and freeing of those cpumasks is becoming more complex.

So a number of helper functions are added to simplify and streamline
the management of those cpumasks. To make it simple, all the cpumasks
are now pre-cleared on allocation.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 110 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 29a2bdc671fd..2ac9437ce8f1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -455,6 +455,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 		is_mem_exclusive(p) <= is_mem_exclusive(q);
 }
 
+/**
+ * alloc_cpumasks - allocate three cpumasks for cpuset
+ * @cs:  the cpuset that have cpumasks to be allocated.
+ * @tmp: the tmpmasks structure pointer
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Only one of the two input arguments should be non-NULL.
+ */
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	cpumask_var_t *pmask1, *pmask2, *pmask3;
+
+	if (cs) {
+		pmask1 = &cs->cpus_allowed;
+		pmask2 = &cs->effective_cpus;
+		pmask3 = &cs->subparts_cpus;
+	} else {
+		pmask1 = &tmp->new_cpus;
+		pmask2 = &tmp->addmask;
+		pmask3 = &tmp->delmask;
+	}
+
+	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+		goto free_one;
+
+	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+		goto free_two;
+
+	return 0;
+
+free_two:
+	free_cpumask_var(*pmask2);
+free_one:
+	free_cpumask_var(*pmask1);
+	return -ENOMEM;
+}
+
+/**
+ * free_cpumasks - free cpumasks in a tmpmasks structure
+ * @cs:  the cpuset that have cpumasks to be free.
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	if (cs) {
+		free_cpumask_var(cs->cpus_allowed);
+		free_cpumask_var(cs->effective_cpus);
+		free_cpumask_var(cs->subparts_cpus);
+	}
+	if (tmp) {
+		free_cpumask_var(tmp->new_cpus);
+		free_cpumask_var(tmp->addmask);
+		free_cpumask_var(tmp->delmask);
+	}
+}
+
 /**
  * alloc_trial_cpuset - allocate a trial cpuset
  * @cs: the cpuset that the trial cpuset duplicates
@@ -467,31 +526,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 	if (!trial)
 		return NULL;
 
-	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
-		goto free_cs;
-	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
-		goto free_cpus;
+	if (alloc_cpumasks(trial, NULL)) {
+		kfree(trial);
+		return NULL;
+	}
 
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 	return trial;
-
-free_cpus:
-	free_cpumask_var(trial->cpus_allowed);
-free_cs:
-	kfree(trial);
-	return NULL;
 }
 
 /**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
  */
-static void free_trial_cpuset(struct cpuset *trial)
+static inline void free_cpuset(struct cpuset *cs)
 {
-	free_cpumask_var(trial->effective_cpus);
-	free_cpumask_var(trial->cpus_allowed);
-	kfree(trial);
+	free_cpumasks(cs, NULL);
+	kfree(cs);
 }
 
 /*
@@ -1385,7 +1437,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
 out:
-	free_trial_cpuset(trialcs);
+	free_cpuset(trialcs);
 	return err;
 }
 
@@ -1769,7 +1821,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 		break;
 	}
 
-	free_trial_cpuset(trialcs);
+	free_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
@@ -2024,26 +2076,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
 	if (!cs)
 		return ERR_PTR(-ENOMEM);
-	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
-		goto free_cs;
-	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
-		goto free_cpus;
+
+	if (alloc_cpumasks(cs, NULL)) {
+		kfree(cs);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cpumask_clear(cs->effective_cpus);
 	nodes_clear(cs->effective_mems);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
 	return &cs->css;
-
-free_cpus:
-	free_cpumask_var(cs->cpus_allowed);
-free_cs:
-	kfree(cs);
-	return ERR_PTR(-ENOMEM);
 }
 
 static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -2134,9 +2179,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
-	free_cpumask_var(cs->effective_cpus);
-	free_cpumask_var(cs->cpus_allowed);
-	kfree(cs);
+	free_cpuset(cs);
 }
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@ -2200,6 +2243,7 @@ int __init cpuset_init(void)
 
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
-- 
cgit 


From ee8dde0cd2ce78b62d16aec1c29960b64380e634 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:38 -0500
Subject: cpuset: Add new v2 cpuset.sched.partition flag

A new cpuset.sched.partition boolean flag is added to cpuset v2.
This new flag, if set, indicates that the cgroup is the root of a
new scheduling domain or partition that includes itself and all its
descendants except those that are scheduling domain roots themselves
and their descendants.

With this new flag, one can directly create as many partitions as
necessary without ever using the v1 trick of turning off load balancing
in specific cpusets to create partitions as a side effect.

This new flag is owned by the parent and will cause the CPUs in the
cpuset to be removed from the effective CPUs of its parent.

This is implemented internally by adding a new subparts_cpus mask that
holds the CPUs belonging to child partitions so that:

        subparts_cpus | effective_cpus = cpus_allowed
        subparts_cpus & effective_cpus = 0

This new flag can only be turned on in a cpuset if its parent is a
partition root itself. The state of this flag cannot be changed if the
cpuset has children.

Once turned on, further changes to "cpuset.cpus" is allowed as long
as there is at least one CPU left that can be granted from the parent
and a child partition root cannot use up all the CPUs in the parent's
effective_cpus.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 365 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 352 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2ac9437ce8f1..85e04162adcb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -970,10 +970,191 @@ static void update_tasks_cpumask(struct cpuset *cs)
 	css_task_iter_end(&it);
 }
 
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * If the parent has subpartition CPUs, include them in the list of
+ * allowable CPUs in computing the new effective_cpus mask.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+				      struct cpuset *cs, struct cpuset *parent)
+{
+	if (parent->nr_subparts_cpus) {
+		cpumask_or(new_cpus, parent->effective_cpus,
+			   parent->subparts_cpus);
+		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+	} else {
+		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+	}
+}
+
+/*
+ * Commands for update_parent_subparts_cpumask
+ */
+enum subparts_cmd {
+	partcmd_enable,		/* Enable partition root	 */
+	partcmd_disable,	/* Disable partition root	 */
+	partcmd_update,		/* Update parent's subparts_cpus */
+};
+
+/**
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * @cpuset:  The cpuset that requests change in partition root state
+ * @cmd:     Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp:     Temporary addmask and delmask
+ * Return:   0, 1 or an error code
+ *
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
+ * be put into parent's subparts_cpus and taken away from parent's
+ * effective_cpus. The function will return 0 if all the CPUs listed in
+ * cpus_allowed can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * parent's subparts_cpus will be taken away from that cpumask and put back
+ * into parent's effective_cpus. 0 should always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
+ * cpus_allowed is assumed to remain the same.  The function will return
+ * 1 if changes to parent's subparts_cpus and effective_cpus happen or 0
+ * otherwise. In case of error, an error code will be returned.
+ *
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). The partcmd_update command is used by
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+ * newmask set.
+ *
+ * The checking is more strict when enabling partition root than the
+ * other two commands.
+ *
+ * Because of the implicit cpu exclusive nature of a partition root,
+ * cpumask changes that violates the cpu exclusivity rule will not be
+ * permitted when checked by validate_change(). The validate_change()
+ * function will also prevent any changes to the cpu list if it is not
+ * a superset of children's cpu lists.
+ */
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+					  struct cpumask *newmask,
+					  struct tmpmasks *tmp)
+{
+	struct cpuset *parent = parent_cs(cpuset);
+	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
+	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
+
+	lockdep_assert_held(&cpuset_mutex);
+
+	/*
+	 * The parent must be a partition root.
+	 * The new cpumask, if present, or the current cpus_allowed must
+	 * not be empty.
+	 */
+	if (!is_partition_root(parent) ||
+	   (newmask && cpumask_empty(newmask)) ||
+	   (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+		return -EINVAL;
+
+	/*
+	 * Enabling/disabling partition root is not allowed if there are
+	 * online children.
+	 */
+	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+		return -EBUSY;
+
+	/*
+	 * Enabling partition root is not allowed if not all the CPUs
+	 * can be granted from parent's effective_cpus or at least one
+	 * CPU will be left after that.
+	 */
+	if ((cmd == partcmd_enable) &&
+	   (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+	     cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+		return -EINVAL;
+
+	/*
+	 * A cpumask update cannot make parent's effective_cpus become empty.
+	 */
+	adding = deleting = false;
+	if (cmd == partcmd_enable) {
+		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+		adding = true;
+	} else if (cmd == partcmd_disable) {
+		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+				       parent->subparts_cpus);
+	} else if (newmask) {
+		/*
+		 * partcmd_update with newmask:
+		 *
+		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+		 * addmask = newmask & parent->effective_cpus
+		 *		     & ~parent->subparts_cpus
+		 */
+		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+		deleting = cpumask_and(tmp->delmask, tmp->delmask,
+				       parent->subparts_cpus);
+
+		cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+					parent->subparts_cpus);
+		/*
+		 * Return error if the new effective_cpus could become empty.
+		 */
+		if (adding && !deleting &&
+		    cpumask_equal(parent->effective_cpus, tmp->addmask))
+			return -EINVAL;
+	} else {
+		/*
+		 * partcmd_update w/o newmask:
+		 *
+		 * addmask = cpus_allowed & parent->effectiveb_cpus
+		 *
+		 * Note that parent's subparts_cpus may have been
+		 * pre-shrunk in case the CPUs granted to the parent
+		 * by the grandparent changes. So no deletion is needed.
+		 */
+		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+				     parent->effective_cpus);
+		if (cpumask_equal(tmp->addmask, parent->effective_cpus))
+			return -EINVAL;
+	}
+
+	if (!adding && !deleting)
+		return 0;
+
+	/*
+	 * Change the parent's subparts_cpus.
+	 * Newly added CPUs will be removed from effective_cpus and
+	 * newly deleted ones will be added back to effective_cpus.
+	 */
+	spin_lock_irq(&callback_lock);
+	if (adding) {
+		cpumask_or(parent->subparts_cpus,
+			   parent->subparts_cpus, tmp->addmask);
+		cpumask_andnot(parent->effective_cpus,
+			       parent->effective_cpus, tmp->addmask);
+	}
+	if (deleting) {
+		cpumask_andnot(parent->subparts_cpus,
+			       parent->subparts_cpus, tmp->delmask);
+		cpumask_or(parent->effective_cpus,
+			   parent->effective_cpus, tmp->delmask);
+	}
+
+	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+	spin_unlock_irq(&callback_lock);
+
+	return cmd == partcmd_update;
+}
+
 /*
  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
+ * @cs:  the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
  *
  * When congifured cpumask is changed, the effective cpumasks of this cpuset
  * and all its descendants need to be updated.
@@ -982,7 +1163,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * Called with cpuset_mutex held
  */
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	struct cpuset *cp;
 	struct cgroup_subsys_state *pos_css;
@@ -991,28 +1172,61 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_lock();
 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 		struct cpuset *parent = parent_cs(cp);
+		bool cs_empty;
+
+		compute_effective_cpumask(tmp->new_cpus, cp, parent);
+		cs_empty = cpumask_empty(tmp->new_cpus);
 
-		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+		/*
+		 * A partition root cannot have empty effective_cpus
+		 */
+		WARN_ON_ONCE(cs_empty && is_partition_root(cp));
 
 		/*
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (is_in_v2_mode() && cpumask_empty(new_cpus))
-			cpumask_copy(new_cpus, parent->effective_cpus);
+		if (is_in_v2_mode() && cs_empty)
+			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
 
-		/* Skip the whole subtree if the cpumask remains the same. */
-		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+		/*
+		 * Skip the whole subtree if the cpumask remains the same
+		 * and has no partition root state.
+		 */
+		if (!is_partition_root(cp) &&
+		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
 			pos_css = css_rightmost_descendant(pos_css);
 			continue;
 		}
 
+		/*
+		 * update_parent_subparts_cpumask() should have been called
+		 * for cs already in update_cpumask(). We should also call
+		 * update_tasks_cpumask() again for tasks in the parent
+		 * cpuset if the parent's subparts_cpus changes.
+		 */
+		if ((cp != cs) && cp->partition_root_state &&
+		    update_parent_subparts_cpumask(cp, partcmd_update,
+						   NULL, tmp)) {
+			if (parent != &top_cpuset)
+				update_tasks_cpumask(parent);
+		}
+
 		if (!css_tryget_online(&cp->css))
 			continue;
 		rcu_read_unlock();
 
 		spin_lock_irq(&callback_lock);
-		cpumask_copy(cp->effective_cpus, new_cpus);
+
+		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+		if (cp->nr_subparts_cpus) {
+			/*
+			 * Make sure that effective_cpus & subparts_cpus
+			 * are mutually exclusive.
+			 */
+			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+				       cp->subparts_cpus);
+		}
 		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!is_in_v2_mode() &&
@@ -1047,6 +1261,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 			  const char *buf)
 {
 	int retval;
+	struct tmpmasks tmp;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
 	if (cs == &top_cpuset)
@@ -1078,12 +1293,39 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	/*
+	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
+	 * to allocated cpumasks.
+	 */
+	tmp.addmask  = trialcs->subparts_cpus;
+	tmp.delmask  = trialcs->effective_cpus;
+	tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+	if (cs->partition_root_state) {
+		/* Cpumask of a partition root cannot be empty */
+		if (cpumask_empty(trialcs->cpus_allowed))
+			return -EINVAL;
+		if (update_parent_subparts_cpumask(cs, partcmd_update,
+					trialcs->cpus_allowed, &tmp) < 0)
+			return -EINVAL;
+	}
+
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+
+	/*
+	 * Make sure that subparts_cpus is a subset of cpus_allowed.
+	 */
+	if (cs->nr_subparts_cpus) {
+		cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+			       cs->cpus_allowed);
+		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+	}
 	spin_unlock_irq(&callback_lock);
 
-	/* use trialcs->cpus_allowed as a temp variable */
-	update_cpumasks_hier(cs, trialcs->cpus_allowed);
+	update_cpumasks_hier(cs, &tmp);
 	return 0;
 }
 
@@ -1441,6 +1683,80 @@ out:
 	return err;
 }
 
+/*
+ * update_prstate - update partititon_root_state
+ * cs:	the cpuset to update
+ * val: 0 - disabled, 1 - enabled
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int val)
+{
+	int err;
+	struct cpuset *parent = parent_cs(cs);
+	struct tmpmasks tmp;
+
+	if ((val != 0) && (val != 1))
+		return -EINVAL;
+	if (val == cs->partition_root_state)
+		return 0;
+
+	/*
+	 * Cannot force a partial or erroneous partition root to a full
+	 * partition root.
+	 */
+	if (val && cs->partition_root_state)
+		return -EINVAL;
+
+	if (alloc_cpumasks(NULL, &tmp))
+		return -ENOMEM;
+
+	err = -EINVAL;
+	if (!cs->partition_root_state) {
+		/*
+		 * Turning on partition root requires setting the
+		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+		 * cannot be NULL.
+		 */
+		if (cpumask_empty(cs->cpus_allowed))
+			goto out;
+
+		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+		if (err)
+			goto out;
+
+		err = update_parent_subparts_cpumask(cs, partcmd_enable,
+						     NULL, &tmp);
+		if (err) {
+			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+			goto out;
+		}
+		cs->partition_root_state = PRS_ENABLED;
+	} else {
+		err = update_parent_subparts_cpumask(cs, partcmd_disable,
+						     NULL, &tmp);
+		if (err)
+			goto out;
+
+		cs->partition_root_state = 0;
+
+		/* Turning off CS_CPU_EXCLUSIVE will not return error */
+		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+	}
+
+	/*
+	 * Update cpumask of parent's tasks except when it is the top
+	 * cpuset as some system daemons cannot be mapped to other CPUs.
+	 */
+	if (parent != &top_cpuset)
+		update_tasks_cpumask(parent);
+
+	rebuild_sched_domains_locked();
+out:
+	free_cpumasks(NULL, &tmp);
+	return err;
+}
+
 /*
  * Frequency meter - How fast is some event occurring?
  *
@@ -1686,6 +2002,7 @@ typedef enum {
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
 	FILE_SCHED_LOAD_BALANCE,
+	FILE_PARTITION_ROOT,
 	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
@@ -1755,6 +2072,9 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		retval = update_relax_domain_level(cs, val);
 		break;
+	case FILE_PARTITION_ROOT:
+		retval = update_prstate(cs, val);
+		break;
 	default:
 		retval = -EINVAL;
 		break;
@@ -1905,6 +2225,8 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		return cs->relax_domain_level;
+	case FILE_PARTITION_ROOT:
+		return cs->partition_root_state;
 	default:
 		BUG();
 	}
@@ -2056,6 +2378,14 @@ static struct cftype dfl_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
 
+	{
+		.name = "sched.partition",
+		.read_s64 = cpuset_read_s64,
+		.write_s64 = cpuset_write_s64,
+		.private = FILE_PARTITION_ROOT,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
 	{ }	/* terminate */
 };
 
@@ -2157,7 +2487,12 @@ out_unlock:
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ *
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
+ * turning 'sched.partition" off.
  */
 
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@ -2166,7 +2501,11 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 
 	mutex_lock(&cpuset_mutex);
 
-	if (is_sched_load_balance(cs))
+	if (is_partition_root(cs))
+		update_prstate(cs, 0);
+
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	cpuset_dec();
-- 
cgit 


From 3881b86128d0be22e8947ac1fca0429c74bf055e Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:39 -0500
Subject: cpuset: Add an error state to cpuset.sched.partition

When external events like CPU offlining or user events like changing
the cpu list of an ancestor cpuset happen, update_cpumasks_hier()
will be called to update the effective cpus of each of the affected
cpusets. That will then call update_parent_subparts_cpumask() if
partitions are impacted.

Currently, these events may cause update_parent_subparts_cpumask()
to return error if none of the requested cpus are available or it will
consume all the cpus in the parent partition root. Handling these errors
is problematic as the states may become inconsistent.

Instead of letting update_parent_subparts_cpumask() return error, a new
error state (-1) is added to the partition_root_state flag to designate
the fact that the partition is no longer valid. IOW, it is no longer a
real partition root, but the CS_CPU_EXCLUSIVE flag will still be set
as it can be changed back to a real one if favorable change happens
later on.

This new error state is set internally and user cannot write this new
value to "cpuset.sched.partition".

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 153 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 129 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 85e04162adcb..ef41f58d7cdf 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -153,10 +153,19 @@ struct cpuset {
  * Partition root states:
  *
  *   0 - not a partition root
+ *
  *   1 - partition root
+ *
+ *  -1 - invalid partition root
+ *       None of the cpus in cpus_allowed can be put into the parent's
+ *       subparts_cpus. In this case, the cpuset is not a real partition
+ *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
+ *       and the cpuset can be restored back to a partition root if the
+ *       parent cpuset can give more CPUs back to this child cpuset.
  */
 #define PRS_DISABLED		0
 #define PRS_ENABLED		1
+#define PRS_ERROR		-1
 
 /*
  * Temporary cpumasks for working with partitions that are passed among
@@ -251,7 +260,7 @@ static inline int is_spread_slab(const struct cpuset *cs)
 
 static inline int is_partition_root(const struct cpuset *cs)
 {
-	return cs->partition_root_state;
+	return cs->partition_root_state > 0;
 }
 
 static struct cpuset top_cpuset = {
@@ -1021,9 +1030,12 @@ enum subparts_cmd {
  *
  * For partcmd_update, if the optional newmask is specified, the cpu
  * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same.  The function will return
- * 1 if changes to parent's subparts_cpus and effective_cpus happen or 0
- * otherwise. In case of error, an error code will be returned.
+ * cpus_allowed is assumed to remain the same. The cpuset should either
+ * be a partition root or an invalid partition root. The partition root
+ * state may change if newmask is NULL and none of the requested CPUs can
+ * be granted by the parent. The function will return 1 if changes to
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+ * Error code should only be returned when newmask is non-NULL.
  *
  * The partcmd_enable and partcmd_disable commands are used by
  * update_prstate(). The partcmd_update command is used by
@@ -1046,6 +1058,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	struct cpuset *parent = parent_cs(cpuset);
 	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
 	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
+	bool part_error = false;	/* Partition error? */
 
 	lockdep_assert_held(&cpuset_mutex);
 
@@ -1114,13 +1127,48 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		 * addmask = cpus_allowed & parent->effectiveb_cpus
 		 *
 		 * Note that parent's subparts_cpus may have been
-		 * pre-shrunk in case the CPUs granted to the parent
-		 * by the grandparent changes. So no deletion is needed.
+		 * pre-shrunk in case there is a change in the cpu list.
+		 * So no deletion is needed.
 		 */
 		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
 				     parent->effective_cpus);
-		if (cpumask_equal(tmp->addmask, parent->effective_cpus))
-			return -EINVAL;
+		part_error = cpumask_equal(tmp->addmask,
+					   parent->effective_cpus);
+	}
+
+	if (cmd == partcmd_update) {
+		int prev_prs = cpuset->partition_root_state;
+
+		/*
+		 * Check for possible transition between PRS_ENABLED
+		 * and PRS_ERROR.
+		 */
+		switch (cpuset->partition_root_state) {
+		case PRS_ENABLED:
+			if (part_error)
+				cpuset->partition_root_state = PRS_ERROR;
+			break;
+		case PRS_ERROR:
+			if (!part_error)
+				cpuset->partition_root_state = PRS_ENABLED;
+			break;
+		}
+		/*
+		 * Set part_error if previously in invalid state.
+		 */
+		part_error = (prev_prs == PRS_ERROR);
+	}
+
+	if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+		return 0;	/* Nothing need to be done */
+
+	if (cpuset->partition_root_state == PRS_ERROR) {
+		/*
+		 * Remove all its cpus from parent's subparts_cpus.
+		 */
+		adding = false;
+		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+				       parent->subparts_cpus);
 	}
 
 	if (!adding && !deleting)
@@ -1172,28 +1220,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 	rcu_read_lock();
 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 		struct cpuset *parent = parent_cs(cp);
-		bool cs_empty;
 
 		compute_effective_cpumask(tmp->new_cpus, cp, parent);
-		cs_empty = cpumask_empty(tmp->new_cpus);
-
-		/*
-		 * A partition root cannot have empty effective_cpus
-		 */
-		WARN_ON_ONCE(cs_empty && is_partition_root(cp));
 
 		/*
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (is_in_v2_mode() && cs_empty)
+		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus))
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
 
 		/*
 		 * Skip the whole subtree if the cpumask remains the same
 		 * and has no partition root state.
 		 */
-		if (!is_partition_root(cp) &&
+		if (!cp->partition_root_state &&
 		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
 			pos_css = css_rightmost_descendant(pos_css);
 			continue;
@@ -1205,11 +1246,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		 * update_tasks_cpumask() again for tasks in the parent
 		 * cpuset if the parent's subparts_cpus changes.
 		 */
-		if ((cp != cs) && cp->partition_root_state &&
-		    update_parent_subparts_cpumask(cp, partcmd_update,
-						   NULL, tmp)) {
-			if (parent != &top_cpuset)
-				update_tasks_cpumask(parent);
+		if ((cp != cs) && cp->partition_root_state) {
+			switch (parent->partition_root_state) {
+			case PRS_DISABLED:
+				/*
+				 * If parent is not a partition root or an
+				 * invalid partition root, clear the state
+				 * state and the CS_CPU_EXCLUSIVE flag.
+				 */
+				WARN_ON_ONCE(cp->partition_root_state
+					     != PRS_ERROR);
+				cp->partition_root_state = 0;
+
+				/*
+				 * clear_bit() is an atomic operation and
+				 * readers aren't interested in the state
+				 * of CS_CPU_EXCLUSIVE anyway. So we can
+				 * just update the flag without holding
+				 * the callback_lock.
+				 */
+				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+				break;
+
+			case PRS_ENABLED:
+				if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+					update_tasks_cpumask(parent);
+				break;
+
+			case PRS_ERROR:
+				/*
+				 * When parent is invalid, it has to be too.
+				 */
+				cp->partition_root_state = PRS_ERROR;
+				if (cp->nr_subparts_cpus) {
+					cp->nr_subparts_cpus = 0;
+					cpumask_clear(cp->subparts_cpus);
+				}
+				break;
+			}
 		}
 
 		if (!css_tryget_online(&cp->css))
@@ -1219,13 +1293,33 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		spin_lock_irq(&callback_lock);
 
 		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
-		if (cp->nr_subparts_cpus) {
+		if (cp->nr_subparts_cpus &&
+		   (cp->partition_root_state != PRS_ENABLED)) {
+			cp->nr_subparts_cpus = 0;
+			cpumask_clear(cp->subparts_cpus);
+		} else if (cp->nr_subparts_cpus) {
 			/*
 			 * Make sure that effective_cpus & subparts_cpus
 			 * are mutually exclusive.
+			 *
+			 * In the unlikely event that effective_cpus
+			 * becomes empty. we clear cp->nr_subparts_cpus and
+			 * let its child partition roots to compete for
+			 * CPUs again.
 			 */
 			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
 				       cp->subparts_cpus);
+			if (cpumask_empty(cp->effective_cpus)) {
+				cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+				cpumask_clear(cp->subparts_cpus);
+				cp->nr_subparts_cpus = 0;
+			} else if (!cpumask_subset(cp->subparts_cpus,
+						   tmp->new_cpus)) {
+				cpumask_andnot(cp->subparts_cpus,
+					cp->subparts_cpus, tmp->new_cpus);
+				cp->nr_subparts_cpus
+					= cpumask_weight(cp->subparts_cpus);
+			}
 		}
 		spin_unlock_irq(&callback_lock);
 
@@ -1702,7 +1796,7 @@ static int update_prstate(struct cpuset *cs, int val)
 		return 0;
 
 	/*
-	 * Cannot force a partial or erroneous partition root to a full
+	 * Cannot force a partial or invalid partition root to a full
 	 * partition root.
 	 */
 	if (val && cs->partition_root_state)
@@ -1733,6 +1827,17 @@ static int update_prstate(struct cpuset *cs, int val)
 		}
 		cs->partition_root_state = PRS_ENABLED;
 	} else {
+		/*
+		 * Turning off partition root will clear the
+		 * CS_CPU_EXCLUSIVE bit.
+		 */
+		if (cs->partition_root_state == PRS_ERROR) {
+			cs->partition_root_state = 0;
+			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+			err = 0;
+			goto out;
+		}
+
 		err = update_parent_subparts_cpumask(cs, partcmd_disable,
 						     NULL, &tmp);
 		if (err)
-- 
cgit 


From 4716909cc5c566e946a3acc884bf5dc469812007 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:40 -0500
Subject: cpuset: Track cpusets that use parent's effective_cpus

In the default hierarchy, a cpuset will use the parent's effective_cpus
if none of the requested CPUs can be granted from the parent. That can
be a problem if a parent is a partition root with children partition
roots. Changes to a parent's effective_cpus list due to changes in a
child partition root may not be properly reflected in a child cpuset
that use parent's effective_cpus because the cpu_exclusive rule of a
partition root will not guard against that.

In order to avoid the mismatch, two new tracking variables are added to
the cpuset structure to track if a cpuset uses parent's effective_cpus
and the number of children cpusets that use its effective_cpus. So
whenever cpumask changes are made to a parent, it will also check to
see if it has other children cpusets that use its effective_cpus and
call update_cpumasks_hier() if that is the case.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ef41f58d7cdf..21eaa896c1a9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -147,6 +147,14 @@ struct cpuset {
 
 	/* partition root state */
 	int partition_root_state;
+
+	/*
+	 * Default hierarchy only:
+	 * use_parent_ecpus - set if using parent's effective_cpus
+	 * child_ecpus_count - # of children with use_parent_ecpus set
+	 */
+	int use_parent_ecpus;
+	int child_ecpus_count;
 };
 
 /*
@@ -1227,8 +1235,17 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus))
+		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+			if (!cp->use_parent_ecpus) {
+				cp->use_parent_ecpus = true;
+				parent->child_ecpus_count++;
+			}
+		} else if (cp->use_parent_ecpus) {
+			cp->use_parent_ecpus = false;
+			WARN_ON_ONCE(!parent->child_ecpus_count);
+			parent->child_ecpus_count--;
+		}
 
 		/*
 		 * Skip the whole subtree if the cpumask remains the same
@@ -1345,6 +1362,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		rebuild_sched_domains_locked();
 }
 
+/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent:  Parent cpuset
+ * @cs:      Current cpuset
+ * @tmp:     Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+				    struct tmpmasks *tmp)
+{
+	struct cpuset *sibling;
+	struct cgroup_subsys_state *pos_css;
+
+	/*
+	 * Check all its siblings and call update_cpumasks_hier()
+	 * if their use_parent_ecpus flag is set in order for them
+	 * to use the right effective_cpus value.
+	 */
+	rcu_read_lock();
+	cpuset_for_each_child(sibling, pos_css, parent) {
+		if (sibling == cs)
+			continue;
+		if (!sibling->use_parent_ecpus)
+			continue;
+
+		update_cpumasks_hier(sibling, tmp);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -1420,6 +1466,17 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	spin_unlock_irq(&callback_lock);
 
 	update_cpumasks_hier(cs, &tmp);
+
+	if (cs->partition_root_state) {
+		struct cpuset *parent = parent_cs(cs);
+
+		/*
+		 * For partition root, update the cpumasks of sibling
+		 * cpusets if they use parent's effective_cpus.
+		 */
+		if (parent->child_ecpus_count)
+			update_sibling_cpumasks(parent, cs, &tmp);
+	}
 	return 0;
 }
 
@@ -1856,6 +1913,9 @@ static int update_prstate(struct cpuset *cs, int val)
 	if (parent != &top_cpuset)
 		update_tasks_cpumask(parent);
 
+	if (parent->child_ecpus_count)
+		update_sibling_cpumasks(parent, cs, &tmp);
+
 	rebuild_sched_domains_locked();
 out:
 	free_cpumasks(NULL, &tmp);
@@ -2550,6 +2610,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	if (is_in_v2_mode()) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
+		cs->use_parent_ecpus = true;
+		parent->child_ecpus_count++;
 	}
 	spin_unlock_irq(&callback_lock);
 
@@ -2613,6 +2675,13 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	    is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
+	if (cs->use_parent_ecpus) {
+		struct cpuset *parent = parent_cs(cs);
+
+		cs->use_parent_ecpus = false;
+		parent->child_ecpus_count--;
+	}
+
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-- 
cgit 


From 4b842da276a8a1057aed7af6b2a5da471f840dd0 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:41 -0500
Subject: cpuset: Make CPU hotplug work with partition

When there is a cpu hotplug event (CPU online or offline), the partitions
may need to be reconfigured and regenerated. So code is added to the
hotplug functions to make them work with new subparts_cpus mask to
compute the right effective_cpus for each of the affected cpusets.
It may also change the state of a partition root from real one to an
erroneous one or vice versa.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 131 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 116 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 21eaa896c1a9..82c88c60c83d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -113,6 +113,9 @@ struct cpuset {
 	 * CPUs allocated to child sub-partitions (default hierarchy only)
 	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
 	 * - effective_cpus and subparts_cpus are mutually exclusive.
+	 *
+	 * effective_cpus contains only onlined CPUs, but subparts_cpus
+	 * may have offlined ones.
 	 */
 	cpumask_var_t subparts_cpus;
 
@@ -994,7 +997,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  * @parent: the parent cpuset
  *
  * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask.
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
  */
 static void compute_effective_cpumask(struct cpumask *new_cpus,
 				      struct cpuset *cs, struct cpuset *parent)
@@ -1003,6 +1008,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
 		cpumask_or(new_cpus, parent->effective_cpus,
 			   parent->subparts_cpus);
 		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
 	} else {
 		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
 	}
@@ -1125,9 +1131,20 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		/*
 		 * Return error if the new effective_cpus could become empty.
 		 */
-		if (adding && !deleting &&
-		    cpumask_equal(parent->effective_cpus, tmp->addmask))
-			return -EINVAL;
+		if (adding &&
+		    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+			if (!deleting)
+				return -EINVAL;
+			/*
+			 * As some of the CPUs in subparts_cpus might have
+			 * been offlined, we need to compute the real delmask
+			 * to confirm that.
+			 */
+			if (!cpumask_and(tmp->addmask, tmp->delmask,
+					 cpu_active_mask))
+				return -EINVAL;
+			cpumask_copy(tmp->addmask, parent->effective_cpus);
+		}
 	} else {
 		/*
 		 * partcmd_update w/o newmask:
@@ -1197,6 +1214,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	if (deleting) {
 		cpumask_andnot(parent->subparts_cpus,
 			       parent->subparts_cpus, tmp->delmask);
+		/*
+		 * Some of the CPUs in subparts_cpus might have been offlined.
+		 */
+		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
 		cpumask_or(parent->effective_cpus,
 			   parent->effective_cpus, tmp->delmask);
 	}
@@ -2863,20 +2884,29 @@ hotplug_update_tasks(struct cpuset *cs,
 		update_tasks_nodemask(cs);
 }
 
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+	force_rebuild = true;
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
  *
  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
  * all its tasks are moved to the nearest ancestor with both resources.
  */
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
 	bool cpus_updated;
 	bool mems_updated;
+	struct cpuset *parent;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2891,9 +2921,60 @@ retry:
 		goto retry;
 	}
 
-	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
-	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+	parent =  parent_cs(cs);
+	compute_effective_cpumask(&new_cpus, cs, parent);
+	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+	if (cs->nr_subparts_cpus)
+		/*
+		 * Make sure that CPUs allocated to child partitions
+		 * do not show up in effective_cpus.
+		 */
+		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+	if (!tmp || !cs->partition_root_state)
+		goto update_tasks;
+
+	/*
+	 * In the unlikely event that a partition root has empty
+	 * effective_cpus or its parent becomes erroneous, we have to
+	 * transition it to the erroneous state.
+	 */
+	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+	   (parent->partition_root_state == PRS_ERROR))) {
+		if (cs->nr_subparts_cpus) {
+			cs->nr_subparts_cpus = 0;
+			cpumask_clear(cs->subparts_cpus);
+			compute_effective_cpumask(&new_cpus, cs, parent);
+		}
 
+		/*
+		 * If the effective_cpus is empty because the child
+		 * partitions take away all the CPUs, we can keep
+		 * the current partition and let the child partitions
+		 * fight for available CPUs.
+		 */
+		if ((parent->partition_root_state == PRS_ERROR) ||
+		     cpumask_empty(&new_cpus)) {
+			update_parent_subparts_cpumask(cs, partcmd_disable,
+						       NULL, tmp);
+			cs->partition_root_state = PRS_ERROR;
+		}
+		cpuset_force_rebuild();
+	}
+
+	/*
+	 * On the other hand, an erroneous partition root may be transitioned
+	 * back to a regular one or a partition root with no CPU allocated
+	 * from the parent may change to erroneous.
+	 */
+	if (is_partition_root(parent) &&
+	   ((cs->partition_root_state == PRS_ERROR) ||
+	    !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+	     update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+		cpuset_force_rebuild();
+
+update_tasks:
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
 	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
@@ -2907,13 +2988,6 @@ retry:
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
@@ -2936,6 +3010,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
 	bool on_dfl = is_in_v2_mode();
+	struct tmpmasks tmp, *ptmp = NULL;
+
+	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+		ptmp = &tmp;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2943,6 +3021,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	cpumask_copy(&new_cpus, cpu_active_mask);
 	new_mems = node_states[N_MEMORY];
 
+	/*
+	 * If subparts_cpus is populated, it is likely that the check below
+	 * will produce a false positive on cpus_updated when the cpu list
+	 * isn't changed. It is extra work, but it is better to be safe.
+	 */
 	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
 	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
@@ -2951,6 +3034,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+		/*
+		 * Make sure that CPUs allocated to child partitions
+		 * do not show up in effective_cpus. If no CPU is left,
+		 * we clear the subparts_cpus & let the child partitions
+		 * fight for the CPUs again.
+		 */
+		if (top_cpuset.nr_subparts_cpus) {
+			if (cpumask_subset(&new_cpus,
+					   top_cpuset.subparts_cpus)) {
+				top_cpuset.nr_subparts_cpus = 0;
+				cpumask_clear(top_cpuset.subparts_cpus);
+			} else {
+				cpumask_andnot(&new_cpus, &new_cpus,
+					       top_cpuset.subparts_cpus);
+			}
+		}
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
 		spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2979,7 +3078,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 				continue;
 			rcu_read_unlock();
 
-			cpuset_hotplug_update_tasks(cs);
+			cpuset_hotplug_update_tasks(cs, ptmp);
 
 			rcu_read_lock();
 			css_put(&cs->css);
@@ -2992,6 +3091,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		force_rebuild = false;
 		rebuild_sched_domains();
 	}
+
+	free_cpumasks(NULL, ptmp);
 }
 
 void cpuset_update_active_cpus(void)
-- 
cgit 


From 0ccea8feb9807ba87b0405a826f6830a386706f5 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:42 -0500
Subject: cpuset: Make generate_sched_domains() work with partition

The generate_sched_domains() function is modified to make it work
correctly with the newly introduced subparts_cpus mask for scheduling
domains generation.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 82c88c60c83d..3960de7a75cc 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -769,13 +769,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
+	bool root_load_balance = is_sched_load_balance(&top_cpuset);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (is_sched_load_balance(&top_cpuset)) {
+	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
@@ -798,6 +799,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	csn = 0;
 
 	rcu_read_lock();
+	if (root_load_balance)
+		csa[csn++] = &top_cpuset;
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
@@ -808,6 +811,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
+		 *
+		 * If root is load-balancing, we can skip @cp if it
+		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -815,11 +821,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
 					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
 			continue;
 
+		if (root_load_balance &&
+		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+			continue;
+
 		if (is_sched_load_balance(cp))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree */
-		pos_css = css_rightmost_descendant(pos_css);
+		/* skip @cp's subtree if not a partition root */
+		if (!is_partition_root(cp))
+			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
 
@@ -947,7 +958,12 @@ static void rebuild_sched_domains_locked(void)
 	 * passing doms with offlined cpu to partition_sched_domains().
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
-	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+	if (!top_cpuset.nr_subparts_cpus &&
+	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+		goto out;
+
+	if (top_cpuset.nr_subparts_cpus &&
+	   !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
 		goto out;
 
 	/* Generate domain masks and attrs */
@@ -1367,11 +1383,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 		update_tasks_cpumask(cp);
 
 		/*
-		 * If the effective cpumask of any non-empty cpuset is changed,
-		 * we need to rebuild sched domains.
+		 * On legacy hierarchy, if the effective cpumask of any non-
+		 * empty cpuset is changed, we need to rebuild sched domains.
+		 * On default hierarchy, the cpuset needs to be a partition
+		 * root as well.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
-		    is_sched_load_balance(cp))
+		    is_sched_load_balance(cp) &&
+		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+		    is_partition_root(cp)))
 			need_rebuild_sched_domains = true;
 
 		rcu_read_lock();
-- 
cgit 


From 5776ceccd4de2a53dec740422a409e9e588c5a70 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:43 -0500
Subject: cpuset: Expose cpus.effective and mems.effective on cgroup v2 root

Because of the fact that setting the "cpuset.sched.partition" in
a direct child of root can remove CPUs from the root's effective CPU
list, it makes sense to know what CPUs are left in the root cgroup for
scheduling purpose. So the "cpuset.cpus.effective" control file is now
exposed in the v2 cgroup root.

For consistency, the "cpuset.mems.effective" control file is exposed
as well.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 4 ++--
 kernel/cgroup/cpuset.c                  | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 01b70f69304e..595b0757ad2b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1653,7 +1653,7 @@ Cpuset Interface Files
 	and won't be affected by any CPU hotplug events.
 
   cpuset.cpus.effective
-	A read-only multiple values file which exists on non-root
+	A read-only multiple values file which exists on all
 	cpuset-enabled cgroups.
 
 	It lists the onlined CPUs that are actually granted to this
@@ -1693,7 +1693,7 @@ Cpuset Interface Files
 	and won't be affected by any memory nodes hotplug events.
 
   cpuset.mems.effective
-	A read-only multiple values file which exists on non-root
+	A read-only multiple values file which exists on all
 	cpuset-enabled cgroups.
 
 	It lists the onlined memory nodes that are actually granted to
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3960de7a75cc..fc1a809cd5bb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2574,14 +2574,12 @@ static struct cftype dfl_files[] = {
 		.name = "cpus.effective",
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_EFFECTIVE_CPULIST,
-		.flags = CFTYPE_NOT_ON_ROOT,
 	},
 
 	{
 		.name = "mems.effective",
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_EFFECTIVE_MEMLIST,
-		.flags = CFTYPE_NOT_ON_ROOT,
 	},
 
 	{
-- 
cgit 


From bb5b553c33cb3393f604483b103158bf7d01ca1c Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:44 -0500
Subject: cpuset: Use descriptive text when reading/writing
 cpuset.sched.partition

Currently, cpuset.sched.partition returns the values, 0, 1 or -1 on
read. A person who is not familiar with the partition code may not
understand what they mean.

In order to make cpuset.sched.partition more user-friendly, it will
now display the following descriptive text on read:

  "root" - A partition root (top cpuset of a partition)
  "member" - A non-root member of a partition
  "root invalid" - An invalid partition root

Note that there is at least one partition in the whole cgroup hierarchy.
The top cpuset is the root of that partition.  The rests are either a
root if it starts a new partition or a member of a partition.

The cpuset.sched.partition file will now also accept "root" and
"member" besides 1 and 0 as valid input values. The "root invalid"
value is internal only and cannot be written to the file.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index fc1a809cd5bb..c739fda805e0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2278,9 +2278,6 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		retval = update_relax_domain_level(cs, val);
 		break;
-	case FILE_PARTITION_ROOT:
-		retval = update_prstate(cs, val);
-		break;
 	default:
 		retval = -EINVAL;
 		break;
@@ -2431,8 +2428,6 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		return cs->relax_domain_level;
-	case FILE_PARTITION_ROOT:
-		return cs->partition_root_state;
 	default:
 		BUG();
 	}
@@ -2441,6 +2436,55 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	return 0;
 }
 
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+	struct cpuset *cs = css_cs(seq_css(seq));
+
+	switch (cs->partition_root_state) {
+	case PRS_ENABLED:
+		seq_puts(seq, "root\n");
+		break;
+	case PRS_DISABLED:
+		seq_puts(seq, "member\n");
+		break;
+	case PRS_ERROR:
+		seq_puts(seq, "root invalid\n");
+		break;
+	}
+	return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+				     size_t nbytes, loff_t off)
+{
+	struct cpuset *cs = css_cs(of_css(of));
+	int val;
+	int retval = -ENODEV;
+
+	buf = strstrip(buf);
+
+	/*
+	 * Convert "root"/"1" to 1, and convert "member"/"0" to 0.
+	 */
+	if (!strcmp(buf, "root") || !strcmp(buf, "1"))
+		val = PRS_ENABLED;
+	else if (!strcmp(buf, "member") || !strcmp(buf, "0"))
+		val = PRS_DISABLED;
+	else
+		return -EINVAL;
+
+	css_get(&cs->css);
+	mutex_lock(&cpuset_mutex);
+	if (!is_cpuset_online(cs))
+		goto out_unlock;
+
+	retval = update_prstate(cs, val);
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
+	css_put(&cs->css);
+	return retval ?: nbytes;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2584,8 +2628,8 @@ static struct cftype dfl_files[] = {
 
 	{
 		.name = "sched.partition",
-		.read_s64 = cpuset_read_s64,
-		.write_s64 = cpuset_write_s64,
+		.seq_show = sched_partition_show,
+		.write = sched_partition_write,
 		.private = FILE_PARTITION_ROOT,
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
-- 
cgit 


From 5cf8114d6e90b3822be5eb6a2faedf99d1c08f77 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 8 Nov 2018 10:08:46 -0500
Subject: cpuset: Expose cpuset.cpus.subpartitions with cgroup_debug

For debugging purpose, it will be useful to expose the content of the
subparts_cpus as a read-only file to see if the code work correctly.
However, subparts_cpus will not be used at all in most use cases. So
adding a new cpuset file that clutters the cgroup directory may not be
desirable.  This is now being done by using the hidden "cgroup_debug"
kernel command line option to expose a new "cpuset.cpus.subpartitions"
file.

That option was originally used by the debug controller to expose
itself when configured into the kernel. This is now extended to set an
internal flag used by cgroup_addrm_files(). A new CFTYPE_DEBUG flag
can now be used to specify that a cgroup file should only be created
when the "cgroup_debug" option is specified.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h     |  1 +
 kernel/cgroup/cgroup-internal.h |  2 ++
 kernel/cgroup/cgroup.c          | 14 +++++++++++++-
 kernel/cgroup/cpuset.c          | 11 +++++++++++
 kernel/cgroup/debug.c           |  4 +---
 5 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5e1694fe035b..8fcbae1b8db0 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -92,6 +92,7 @@ enum {
 
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
+	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fcf2180..c950864016e2 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -11,6 +11,8 @@
 #define TRACE_CGROUP_PATH_LEN 1024
 extern spinlock_t trace_cgroup_path_lock;
 extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern bool cgroup_debug;
+extern void __init enable_debug_cgroup(void);
 
 /*
  * cgroup_path() takes a spin lock. It is good practice not to take
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2e5d90dfcb49..ed7f0bfe6429 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
 
 DEFINE_SPINLOCK(trace_cgroup_path_lock);
 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
 
 /*
  * Protects cgroup_idr and css_idr so that IDs can be released without
@@ -3639,7 +3640,8 @@ restart:
 			continue;
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
 			continue;
-
+		if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+			continue;
 		if (is_add) {
 			ret = cgroup_add_file(css, cgrp, cft);
 			if (ret) {
@@ -5743,6 +5745,16 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+	cgroup_debug = true;
+	enable_debug_cgroup();
+	return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c739fda805e0..b897314bab53 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2204,6 +2204,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_EFFECTIVE_CPULIST,
 	FILE_EFFECTIVE_MEMLIST,
+	FILE_SUBPARTS_CPULIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
@@ -2382,6 +2383,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_EFFECTIVE_MEMLIST:
 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
+	case FILE_SUBPARTS_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -2634,6 +2638,13 @@ static struct cftype dfl_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
 
+	{
+		.name = "cpus.subpartitions",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_SUBPARTS_CPULIST,
+		.flags = CFTYPE_DEBUG,
+	},
+
 	{ }	/* terminate */
 };
 
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 9caeda610249..5f1b87330bee 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
  * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
  * parameter.
  */
-static int __init enable_cgroup_debug(char *str)
+void __init enable_debug_cgroup(void)
 {
 	debug_cgrp_subsys.dfl_cftypes = debug_files;
 	debug_cgrp_subsys.implicit_on_dfl = true;
 	debug_cgrp_subsys.threaded = true;
-	return 1;
 }
-__setup("cgroup_debug", enable_cgroup_debug);
-- 
cgit 


From 042d4c70a203998697b34eaad1a99f6f09d09e4d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 22 Oct 2018 07:43:22 -0700
Subject: rcu: Eliminate BUG_ON() for sync.c

The sync.c file has a number of calls to BUG_ON(), which panics the
kernel, which is not a good strategy for devices (like embedded) that
don't have a way to capture console output.  This commit therefore
changes these BUG_ON() calls to WARN_ON_ONCE(), but does so quite naively.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/rcu/sync.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 3f943efcf61c..a6ba446a9693 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -125,8 +125,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 		rsp->gp_state = GP_PENDING;
 	spin_unlock_irq(&rsp->rss_lock);
 
-	BUG_ON(need_wait && need_sync);
-
+	WARN_ON_ONCE(need_wait && need_sync);
 	if (need_sync) {
 		gp_ops[rsp->gp_type].sync();
 		rsp->gp_state = GP_PASSED;
@@ -139,7 +138,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 		 * Nobody has yet been allowed the 'fast' path and thus we can
 		 * avoid doing any sync(). The callback will get 'dropped'.
 		 */
-		BUG_ON(rsp->gp_state != GP_PASSED);
+		WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
 	}
 }
 
@@ -166,8 +165,8 @@ static void rcu_sync_func(struct rcu_head *rhp)
 	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
-	BUG_ON(rsp->gp_state != GP_PASSED);
-	BUG_ON(rsp->cb_state == CB_IDLE);
+	WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
+	WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
 
 	spin_lock_irqsave(&rsp->rss_lock, flags);
 	if (rsp->gp_count) {
@@ -225,7 +224,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
 {
 	int cb_state;
 
-	BUG_ON(rsp->gp_count);
+	WARN_ON_ONCE(rsp->gp_count);
 
 	spin_lock_irq(&rsp->rss_lock);
 	if (rsp->cb_state == CB_REPLAY)
@@ -235,6 +234,6 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
 
 	if (cb_state != CB_IDLE) {
 		gp_ops[rsp->gp_type].wait();
-		BUG_ON(rsp->cb_state != CB_IDLE);
+		WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
 	}
 }
-- 
cgit 


From 08543bda42ef06c5ee4cd74501c894aa7cc13ea8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 22 Oct 2018 08:04:03 -0700
Subject: rcu: Eliminate BUG_ON() for kernel/rcu/tree.c

The tree.c file has a number of calls to BUG_ON(), which panics the
kernel, which is not a good strategy for devices (like embedded) that
don't have a way to capture console output.  This commit therefore
converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE().

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 121f833acd04..bdb6659a8dbc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2826,7 +2826,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
 		 * Very early boot, before rcu_init().  Initialize if needed
 		 * and then drop through to queue the callback.
 		 */
-		BUG_ON(cpu != -1);
+		WARN_ON_ONCE(cpu != -1);
 		WARN_ON_ONCE(!rcu_is_watching());
 		if (rcu_segcblist_empty(&rdp->cblist))
 			rcu_segcblist_init(&rdp->cblist);
@@ -3485,7 +3485,8 @@ static int __init rcu_spawn_gp_kthread(void)
 
 	rcu_scheduler_fully_active = 1;
 	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
-	BUG_ON(IS_ERR(t));
+	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
+		return 0;
 	rnp = rcu_get_root();
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	rcu_state.gp_kthread = t;
-- 
cgit 


From 75a8f72245223cce1571b0405b0881ca3c046df7 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 22 Sep 2018 19:41:25 -0400
Subject: rcu: Remove unused rcu_state externs

The rcu_bh_state and rcu_sched_state variables were removed during the
RCU flavor consolidations, but external declarations remain in tree.h.
This commit therefore removes these obsolete declarations.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: <kernel-team@android.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 703e19ff532d..57a937ac51c2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -398,17 +398,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 #define RCU_NAME rcu_name
 #endif /* #else #ifdef CONFIG_TRACING */
 
-/*
- * RCU implementation internal declarations:
- */
-extern struct rcu_state rcu_sched_state;
-
-extern struct rcu_state rcu_bh_state;
-
-#ifdef CONFIG_PREEMPT_RCU
-extern struct rcu_state rcu_preempt_state;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
 int rcu_dynticks_snap(struct rcu_data *rdp);
 
 #ifdef CONFIG_RCU_BOOST
-- 
cgit 


From adbccddb4a16b1dbf047d330ae1e78fd1ec80352 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 22 Sep 2018 19:41:26 -0400
Subject: rcu: Fix rcu_{node,data} comments about gp_seq_needed

Recent changes have removed the old ->gp_seq_needed field from the
rcu_state structure, which in turn obsoleted a couple of comments in
the rcu_node and rcu_data structures.  This commit therefore updates
these comments accordingly.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: <kernel-team@android.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 57a937ac51c2..c3e2807a834a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -57,7 +57,7 @@ struct rcu_node {
 					/*  some rcu_state fields as well as */
 					/*  following. */
 	unsigned long gp_seq;	/* Track rsp->rcu_gp_seq. */
-	unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */
+	unsigned long gp_seq_needed; /* Track furthest future GP request. */
 	unsigned long completedqs; /* All QSes done for this node. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
@@ -163,7 +163,7 @@ union rcu_noqs {
 struct rcu_data {
 	/* 1) quiescent-state and grace-period handling : */
 	unsigned long	gp_seq;		/* Track rsp->rcu_gp_seq counter. */
-	unsigned long	gp_seq_needed;	/* Track rsp->rcu_gp_seq_needed ctr. */
+	unsigned long	gp_seq_needed;	/* Track furthest future GP request. */
 	union rcu_noqs	cpu_no_qs;	/* No QSes yet for this CPU. */
 	bool		core_needs_qs;	/* Core waits for quiesc state. */
 	bool		beenonline;	/* CPU online at least once. */
-- 
cgit 


From 309ba859b95085f61f4f2a154df6be9cb9713a12 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 11 Jul 2018 14:36:49 -0700
Subject: rcu: Eliminate synchronize_rcu_mult()

Now that synchronize_rcu() waits for both RCU read-side critical
sections and preempt-disabled regions of code, the sole caller of
synchronize_rcu_mult() can be replaced by synchronize_rcu().
This patch makes this change and removes synchronize_rcu_mult().
Note that _wait_rcu_gp() still supports synchronize_rcu_mult(),
and thus might be simplified in the future to take only take
a single call_rcu() function rather than the current list of them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate_wait.h | 17 -----------------
 kernel/rcu/update.c           |  6 ++----
 kernel/sched/core.c           |  2 +-
 3 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 8a16c3eb3dd0..c0578ba23c1a 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -31,21 +31,4 @@ do {									\
 
 #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
 
-/**
- * synchronize_rcu_mult - Wait concurrently for multiple grace periods
- * @...: List of call_rcu() functions for different grace periods to wait on
- *
- * This macro waits concurrently for multiple types of RCU grace periods.
- * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait
- * on concurrent RCU and RCU-tasks grace periods.  Waiting on a give SRCU
- * domain requires you to write a wrapper function for that SRCU domain's
- * call_srcu() function, supplying the corresponding srcu_struct.
- *
- * If Tiny RCU, tell _wait_rcu_gp() does not bother waiting for RCU,
- * given that anywhere synchronize_rcu_mult() can be called is automatically
- * a grace period.
- */
-#define synchronize_rcu_mult(...) \
-	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
-
 #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f203b94f6b5b..c729ca5e6ee2 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -335,8 +335,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 	/* Initialize and register callbacks for each crcu_array element. */
 	for (i = 0; i < n; i++) {
 		if (checktiny &&
-		    (crcu_array[i] == call_rcu ||
-		     crcu_array[i] == call_rcu_bh)) {
+		    (crcu_array[i] == call_rcu)) {
 			might_sleep();
 			continue;
 		}
@@ -352,8 +351,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 	/* Wait for all callbacks to be invoked. */
 	for (i = 0; i < n; i++) {
 		if (checktiny &&
-		    (crcu_array[i] == call_rcu ||
-		     crcu_array[i] == call_rcu_bh))
+		    (crcu_array[i] == call_rcu))
 			continue;
 		for (j = 0; j < i; j++)
 			if (crcu_array[j] == crcu_array[i])
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f26b70..ea12ebc57840 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5788,7 +5788,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 	 *
 	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
-	synchronize_rcu_mult(call_rcu, call_rcu_sched);
+	synchronize_rcu();
 
 	if (!sched_smp_initialized)
 		return 0;
-- 
cgit 


From d3ff3891b2edba63a7dee9023306bb66878fc3d8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 11 Jul 2018 14:42:53 -0700
Subject: rcu: Consolidate the RCU update functions invoked by sync.c

This commit retains all the various gp_ops[] entries, but makes their
update functions all be synchronize_rcu(), call_rcu() and rcu_barrier().
The read-side checks remain consistent with the various RCU flavors,
which still exist on the read side.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/rcu/sync.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 3f943efcf61c..9d570b1892b0 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -44,15 +44,15 @@ static const struct {
 		__INIT_HELD(rcu_read_lock_held)
 	},
 	[RCU_SCHED_SYNC] = {
-		.sync = synchronize_sched,
-		.call = call_rcu_sched,
-		.wait = rcu_barrier_sched,
+		.sync = synchronize_rcu,
+		.call = call_rcu,
+		.wait = rcu_barrier,
 		__INIT_HELD(rcu_read_lock_sched_held)
 	},
 	[RCU_BH_SYNC] = {
-		.sync = synchronize_rcu_bh,
-		.call = call_rcu_bh,
-		.wait = rcu_barrier_bh,
+		.sync = synchronize_rcu,
+		.call = call_rcu,
+		.wait = rcu_barrier,
 		__INIT_HELD(rcu_read_lock_bh_held)
 	},
 };
-- 
cgit 


From 78d125d33858d00756baf0f40e75e77dcbfbea55 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 11 Jul 2018 15:36:43 -0700
Subject: sched/membarrier: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, the synchronize_sched()
in sys_membarrier() can be replaced by synchronize_rcu().  This commit
therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
 kernel/sched/membarrier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 76e0eaf4654e..388a7a6c1aa2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -298,7 +298,7 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		if (tick_nohz_full_enabled())
 			return -EINVAL;
 		if (num_online_cpus() > 1)
-			synchronize_sched();
+			synchronize_rcu();
 		return 0;
 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 		return membarrier_global_expedited();
-- 
cgit 


From 0607ba8403c4cdb253f8c5200ecf654dfb7790cc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Apr 2018 13:01:25 -0700
Subject: srcu: Prevent __call_srcu() counter wrap with read-side critical
 section

Ever since cdf7abc4610a ("srcu: Allow use of Tiny/Tree SRCU from
both process and interrupt context"), it has been permissible
to use SRCU read-side critical sections in interrupt context.
This allows __call_srcu() to use SRCU read-side critical sections to
prevent a new SRCU grace period from ending before the call to either
srcu_funnel_gp_start() or srcu_funnel_exp_start completes, thus preventing
SRCU grace-period counter overflow during that time.

Note that this does not permit removal of the counter-wrap checks in
srcu_gp_end().  These check are necessary to handle the case where
a given CPU does not interact at all with SRCU for an extended time
period.

This commit therefore adds an SRCU read-side critical section to
__call_srcu() in order to prevent grace period counter wrap during
the funnel-locking process.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a8846ed7f352..60f3236beaf7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -858,6 +858,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		 rcu_callback_t func, bool do_norm)
 {
 	unsigned long flags;
+	int idx;
 	bool needexp = false;
 	bool needgp = false;
 	unsigned long s;
@@ -871,6 +872,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		return;
 	}
 	rhp->func = func;
+	idx = srcu_read_lock(sp);
 	local_irq_save(flags);
 	sdp = this_cpu_ptr(sp->sda);
 	spin_lock_rcu_node(sdp);
@@ -892,6 +894,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		srcu_funnel_gp_start(sp, sdp, s, do_norm);
 	else if (needexp)
 		srcu_funnel_exp_start(sp, sdp->mynode, s);
+	srcu_read_unlock(sp, idx);
 }
 
 /**
-- 
cgit 


From e647815a4d3b3be9d85b5750ed0f2947fd78fac7 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 8 Nov 2018 04:08:42 -0500
Subject: bpf: let verifier to calculate and record max_pkt_offset

In check_packet_access, update max_pkt_offset after the offset has passed
__check_packet_access.

It should be safe to use u32 for max_pkt_offset as explained in code
comment.

Also, when there is tail call, the max_pkt_offset of the called program is
unknown, so conservatively set max_pkt_offset to MAX_PACKET_OFF for such
case.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33014ae73103..b6a296e01f6a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -293,6 +293,7 @@ struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
+	u32 max_pkt_offset;
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1971ca325fb4..75dab40b19a3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1455,6 +1455,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
 	}
+
+	/* __check_packet_access has made sure "off + size - 1" is within u16.
+	 * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+	 * otherwise find_good_pkt_pointers would have refused to set range info
+	 * that __check_packet_access would have rejected this pkt access.
+	 * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+	 */
+	env->prog->aux->max_pkt_offset =
+		max_t(u32, env->prog->aux->max_pkt_offset,
+		      off + reg->umax_value + size - 1);
+
 	return err;
 }
 
@@ -6138,6 +6149,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 */
 			prog->cb_access = 1;
 			env->prog->aux->stack_depth = MAX_BPF_STACK;
+			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
 
 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpeter for every normal
-- 
cgit 


From 1385d755cfb42f596ef1cf9f5c761010ff3b34e7 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:25 +0000
Subject: bpf: pass a struct with offload callbacks to bpf_offload_dev_create()

For passing device functions for offloaded eBPF programs, there used to
be no place where to store the pointer without making the non-offloaded
programs pay a memory price.

As a consequence, three functions were called with ndo_bpf() through
specific commands. Now that we have struct bpf_offload_dev, and since
none of those operations rely on RTNL, we can turn these three commands
into hooks inside the struct bpf_prog_offload_ops, and pass them as part
of bpf_offload_dev_create().

This commit effectively passes a pointer to the struct to
bpf_offload_dev_create(). We temporarily have two struct
bpf_prog_offload_ops instances, one under offdev->ops and one under
offload->dev_ops. The next patches will make the transition towards the
former, so that offload->dev_ops can be removed, and callbacks relying
on ndo_bpf() added to offdev->ops as well.

While at it, rename "nfp_bpf_analyzer_ops" as "nfp_bpf_dev_ops" (and
similarly for netdevsim).

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c    | 2 +-
 drivers/net/ethernet/netronome/nfp/bpf/main.h    | 2 +-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 ++--
 drivers/net/netdevsim/bpf.c                      | 6 +++---
 include/linux/bpf.h                              | 3 ++-
 kernel/bpf/offload.c                             | 5 ++++-
 6 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 6243af0ab025..dccae0319204 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -465,7 +465,7 @@ static int nfp_bpf_init(struct nfp_app *app)
 		app->ctrl_mtu = nfp_bpf_ctrl_cmsg_mtu(bpf);
 	}
 
-	bpf->bpf_dev = bpf_offload_dev_create();
+	bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops);
 	err = PTR_ERR_OR_ZERO(bpf->bpf_dev);
 	if (err)
 		goto err_free_neutral_maps;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index abdd93d14439..941277936475 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -513,7 +513,7 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx,
 		    int prev_insn_idx);
 int nfp_bpf_finalize(struct bpf_verifier_env *env);
 
-extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops;
+extern const struct bpf_prog_offload_ops nfp_bpf_dev_ops;
 
 struct netdev_bpf;
 struct nfp_app;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index dc548bb4089e..2fca996a7e77 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -209,7 +209,7 @@ nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
 		goto err_free;
 
 	nfp_prog->verifier_meta = nfp_prog_first_meta(nfp_prog);
-	bpf->verifier.ops = &nfp_bpf_analyzer_ops;
+	bpf->verifier.ops = &nfp_bpf_dev_ops;
 
 	return 0;
 
@@ -602,7 +602,7 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog,
 	return 0;
 }
 
-const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = {
+const struct bpf_prog_offload_ops nfp_bpf_dev_ops = {
 	.insn_hook	= nfp_verify_insn,
 	.finalize	= nfp_bpf_finalize,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index cb3518474f0e..135aee864162 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -91,7 +91,7 @@ static int nsim_bpf_finalize(struct bpf_verifier_env *env)
 	return 0;
 }
 
-static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = {
+static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
 	.insn_hook	= nsim_bpf_verify_insn,
 	.finalize	= nsim_bpf_finalize,
 };
@@ -547,7 +547,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		if (err)
 			return err;
 
-		bpf->verifier.ops = &nsim_bpf_analyzer_ops;
+		bpf->verifier.ops = &nsim_bpf_dev_ops;
 		return 0;
 	case BPF_OFFLOAD_TRANSLATE:
 		state = bpf->offload.prog->aux->offload->dev_priv;
@@ -599,7 +599,7 @@ int nsim_bpf_init(struct netdevsim *ns)
 		if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs))
 			return -ENOMEM;
 
-		ns->sdev->bpf_dev = bpf_offload_dev_create();
+		ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops);
 		err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev);
 		if (err)
 			return err;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b6a296e01f6a..c0197c37b2b2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -692,7 +692,8 @@ int bpf_map_offload_get_next_key(struct bpf_map *map,
 
 bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);
 
-struct bpf_offload_dev *bpf_offload_dev_create(void);
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops);
 void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev);
 int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
 				    struct net_device *netdev);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8e93c47f0779..d513fbf9ca53 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -33,6 +33,7 @@
 static DECLARE_RWSEM(bpf_devs_lock);
 
 struct bpf_offload_dev {
+	const struct bpf_prog_offload_ops *ops;
 	struct list_head netdevs;
 };
 
@@ -655,7 +656,8 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
 
-struct bpf_offload_dev *bpf_offload_dev_create(void)
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
 {
 	struct bpf_offload_dev *offdev;
 	int err;
@@ -673,6 +675,7 @@ struct bpf_offload_dev *bpf_offload_dev_create(void)
 	if (!offdev)
 		return ERR_PTR(-ENOMEM);
 
+	offdev->ops = ops;
 	INIT_LIST_HEAD(&offdev->netdevs);
 
 	return offdev;
-- 
cgit 


From 341b3e7b7b89315c43d262da3199098bcf9bbe57 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:26 +0000
Subject: bpf: call verify_insn from its callback in struct bpf_offload_dev

We intend to remove the dev_ops in struct bpf_prog_offload, and to only
keep the ops in struct bpf_offload_dev instead, which is accessible from
more locations for passing function pointers.

But dev_ops is used for calling the verify_insn hook. Switch to the
newly added ops in struct bpf_prog_offload instead.

To avoid table lookups for each eBPF instruction to verify, we remember
the offdev attached to a netdev and modify bpf_offload_find_netdev() to
avoid performing more than once a lookup for a given offload object.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/offload.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c0197c37b2b2..672714cd904f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -273,6 +273,7 @@ struct bpf_prog_offload_ops {
 struct bpf_prog_offload {
 	struct bpf_prog		*prog;
 	struct net_device	*netdev;
+	struct bpf_offload_dev	*offdev;
 	void			*dev_priv;
 	struct list_head	offloads;
 	bool			dev_state;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d513fbf9ca53..2cd3c0d0417b 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -107,6 +107,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 		err = -EINVAL;
 		goto err_unlock;
 	}
+	offload->offdev = ondev->offdev;
 	prog->aux->offload = offload;
 	list_add_tail(&offload->offloads, &ondev->progs);
 	dev_put(offload->netdev);
@@ -167,7 +168,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 	down_read(&bpf_devs_lock);
 	offload = env->prog->aux->offload;
 	if (offload)
-		ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+		ret = offload->offdev->ops->insn_hook(env, insn_idx,
+						      prev_insn_idx);
 	up_read(&bpf_devs_lock);
 
 	return ret;
-- 
cgit 


From 6dc18fa6f4cad69c892d6fb9499f7e41c6a88a8e Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:27 +0000
Subject: bpf: call finalize() from its callback in struct bpf_offload_dev

In a way similar to the change previously brought to the verify_insn
hook, switch to the newly added ops in struct bpf_prog_offload for
calling the functions used to perform final verification steps for
offloaded programs.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/offload.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2cd3c0d0417b..2c88cb4ddfd8 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -183,8 +183,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
 	down_read(&bpf_devs_lock);
 	offload = env->prog->aux->offload;
 	if (offload) {
-		if (offload->dev_ops->finalize)
-			ret = offload->dev_ops->finalize(env);
+		if (offload->offdev->ops->finalize)
+			ret = offload->offdev->ops->finalize(env);
 		else
 			ret = 0;
 	}
-- 
cgit 


From 00db12c3d141356a4d1e6b6f688e0d5ed3b1f757 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:28 +0000
Subject: bpf: call verifier_prep from its callback in struct bpf_offload_dev

In a way similar to the change previously brought to the verify_insn
hook and to the finalize callback, switch to the newly added ops in
struct bpf_prog_offload for calling the functions used to prepare driver
verifiers.

Since the dev_ops pointer in struct bpf_prog_offload is no longer used
by any callback, we can now remove it from struct bpf_prog_offload.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 11 ++++----
 drivers/net/netdevsim/bpf.c                      | 32 +++++++++++++-----------
 include/linux/bpf.h                              |  2 +-
 include/linux/netdevice.h                        |  6 -----
 kernel/bpf/offload.c                             | 22 +++++++---------
 5 files changed, 32 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 2fca996a7e77..16a3a9c55852 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -188,10 +188,11 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 }
 
 static int
-nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
-		      struct netdev_bpf *bpf)
+nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_verifier_env *env)
 {
-	struct bpf_prog *prog = bpf->verifier.prog;
+	struct nfp_net *nn = netdev_priv(netdev);
+	struct bpf_prog *prog = env->prog;
+	struct nfp_app *app = nn->app;
 	struct nfp_prog *nfp_prog;
 	int ret;
 
@@ -209,7 +210,6 @@ nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
 		goto err_free;
 
 	nfp_prog->verifier_meta = nfp_prog_first_meta(nfp_prog);
-	bpf->verifier.ops = &nfp_bpf_dev_ops;
 
 	return 0;
 
@@ -422,8 +422,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
 int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
 {
 	switch (bpf->command) {
-	case BPF_OFFLOAD_VERIFIER_PREP:
-		return nfp_bpf_verifier_prep(app, nn, bpf);
 	case BPF_OFFLOAD_TRANSLATE:
 		return nfp_bpf_translate(nn, bpf->offload.prog);
 	case BPF_OFFLOAD_DESTROY:
@@ -605,4 +603,5 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog,
 const struct bpf_prog_offload_ops nfp_bpf_dev_ops = {
 	.insn_hook	= nfp_verify_insn,
 	.finalize	= nfp_bpf_finalize,
+	.prepare	= nfp_bpf_verifier_prep,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 135aee864162..d045b7d666d9 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -91,11 +91,6 @@ static int nsim_bpf_finalize(struct bpf_verifier_env *env)
 	return 0;
 }
 
-static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
-	.insn_hook	= nsim_bpf_verify_insn,
-	.finalize	= nsim_bpf_finalize,
-};
-
 static bool nsim_xdp_offload_active(struct netdevsim *ns)
 {
 	return ns->xdp_hw.prog;
@@ -263,6 +258,17 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog)
 	return 0;
 }
 
+static int
+nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env)
+{
+	struct netdevsim *ns = netdev_priv(dev);
+
+	if (!ns->bpf_bind_accept)
+		return -EOPNOTSUPP;
+
+	return nsim_bpf_create_prog(ns, env->prog);
+}
+
 static void nsim_bpf_destroy_prog(struct bpf_prog *prog)
 {
 	struct nsim_bpf_bound_prog *state;
@@ -275,6 +281,12 @@ static void nsim_bpf_destroy_prog(struct bpf_prog *prog)
 	kfree(state);
 }
 
+static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
+	.insn_hook	= nsim_bpf_verify_insn,
+	.finalize	= nsim_bpf_finalize,
+	.prepare	= nsim_bpf_verifier_prep,
+};
+
 static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
 {
 	if (bpf->prog && bpf->prog->aux->offload) {
@@ -539,16 +551,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 	ASSERT_RTNL();
 
 	switch (bpf->command) {
-	case BPF_OFFLOAD_VERIFIER_PREP:
-		if (!ns->bpf_bind_accept)
-			return -EOPNOTSUPP;
-
-		err = nsim_bpf_create_prog(ns, bpf->verifier.prog);
-		if (err)
-			return err;
-
-		bpf->verifier.ops = &nsim_bpf_dev_ops;
-		return 0;
 	case BPF_OFFLOAD_TRANSLATE:
 		state = bpf->offload.prog->aux->offload->dev_priv;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 672714cd904f..f250494a4f56 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -268,6 +268,7 @@ struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
 	int (*finalize)(struct bpf_verifier_env *env);
+	int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env);
 };
 
 struct bpf_prog_offload {
@@ -277,7 +278,6 @@ struct bpf_prog_offload {
 	void			*dev_priv;
 	struct list_head	offloads;
 	bool			dev_state;
-	const struct bpf_prog_offload_ops *dev_ops;
 	void			*jited_image;
 	u32			jited_len;
 };
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 857f8abf7b91..0fa2c2744928 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -863,7 +863,6 @@ enum bpf_netdev_command {
 	XDP_QUERY_PROG,
 	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
-	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
 	BPF_OFFLOAD_DESTROY,
 	BPF_OFFLOAD_MAP_ALLOC,
@@ -891,11 +890,6 @@ struct netdev_bpf {
 			/* flags with which program was installed */
 			u32 prog_flags;
 		};
-		/* BPF_OFFLOAD_VERIFIER_PREP */
-		struct {
-			struct bpf_prog *prog;
-			const struct bpf_prog_offload_ops *ops; /* callee set */
-		} verifier;
 		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
 		struct {
 			struct bpf_prog *prog;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2c88cb4ddfd8..1f7ac00a494d 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -142,21 +142,17 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 {
-	struct netdev_bpf data = {};
-	int err;
-
-	data.verifier.prog = env->prog;
+	struct bpf_prog_offload *offload;
+	int ret = -ENODEV;
 
-	rtnl_lock();
-	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
-	if (err)
-		goto exit_unlock;
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload)
+		ret = offload->offdev->ops->prepare(offload->netdev, env);
+	offload->dev_state = !ret;
+	up_read(&bpf_devs_lock);
 
-	env->prog->aux->offload->dev_ops = data.verifier.ops;
-	env->prog->aux->offload->dev_state = true;
-exit_unlock:
-	rtnl_unlock();
-	return err;
+	return ret;
 }
 
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
-- 
cgit 


From b07ade27e93360197e453e5ca80eebdc9099dcb5 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:29 +0000
Subject: bpf: pass translate() as a callback and remove its ndo_bpf subcommand

As part of the transition from ndo_bpf() to callbacks attached to struct
bpf_offload_dev for some of the eBPF offload operations, move the
functions related to code translation to the struct and remove the
subcommand that was used to call them through the NDO.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 11 +++--------
 drivers/net/netdevsim/bpf.c                      | 14 +++++++++-----
 include/linux/bpf.h                              |  1 +
 include/linux/netdevice.h                        |  3 +--
 kernel/bpf/offload.c                             | 14 +++++++-------
 5 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 16a3a9c55852..8653a2189c19 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -33,9 +33,6 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
 	struct nfp_bpf_neutral_map *record;
 	int err;
 
-	/* Map record paths are entered via ndo, update side is protected. */
-	ASSERT_RTNL();
-
 	/* Reuse path - other offloaded program is already tracking this map. */
 	record = rhashtable_lookup_fast(&bpf->maps_neutral, &map->id,
 					nfp_bpf_maps_neutral_params);
@@ -84,8 +81,6 @@ nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog)
 	bool freed = false;
 	int i;
 
-	ASSERT_RTNL();
-
 	for (i = 0; i < nfp_prog->map_records_cnt; i++) {
 		if (--nfp_prog->map_records[i]->count) {
 			nfp_prog->map_records[i] = NULL;
@@ -219,9 +214,10 @@ err_free:
 	return ret;
 }
 
-static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
+static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
+	struct nfp_net *nn = netdev_priv(netdev);
 	unsigned int max_instr;
 	int err;
 
@@ -422,8 +418,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
 int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
 {
 	switch (bpf->command) {
-	case BPF_OFFLOAD_TRANSLATE:
-		return nfp_bpf_translate(nn, bpf->offload.prog);
 	case BPF_OFFLOAD_DESTROY:
 		return nfp_bpf_destroy(nn, bpf->offload.prog);
 	case BPF_OFFLOAD_MAP_ALLOC:
@@ -604,4 +598,5 @@ const struct bpf_prog_offload_ops nfp_bpf_dev_ops = {
 	.insn_hook	= nfp_verify_insn,
 	.finalize	= nfp_bpf_finalize,
 	.prepare	= nfp_bpf_verifier_prep,
+	.translate	= nfp_bpf_translate,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index d045b7d666d9..30c2cd516d1c 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -269,6 +269,14 @@ nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env)
 	return nsim_bpf_create_prog(ns, env->prog);
 }
 
+static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv;
+
+	state->state = "xlated";
+	return 0;
+}
+
 static void nsim_bpf_destroy_prog(struct bpf_prog *prog)
 {
 	struct nsim_bpf_bound_prog *state;
@@ -285,6 +293,7 @@ static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
 	.insn_hook	= nsim_bpf_verify_insn,
 	.finalize	= nsim_bpf_finalize,
 	.prepare	= nsim_bpf_verifier_prep,
+	.translate	= nsim_bpf_translate,
 };
 
 static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
@@ -551,11 +560,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 	ASSERT_RTNL();
 
 	switch (bpf->command) {
-	case BPF_OFFLOAD_TRANSLATE:
-		state = bpf->offload.prog->aux->offload->dev_priv;
-
-		state->state = "xlated";
-		return 0;
 	case BPF_OFFLOAD_DESTROY:
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f250494a4f56..d1eb3c8a3fa9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -269,6 +269,7 @@ struct bpf_prog_offload_ops {
 			 int insn_idx, int prev_insn_idx);
 	int (*finalize)(struct bpf_verifier_env *env);
 	int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env);
+	int (*translate)(struct net_device *netdev, struct bpf_prog *prog);
 };
 
 struct bpf_prog_offload {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0fa2c2744928..27499127e038 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -863,7 +863,6 @@ enum bpf_netdev_command {
 	XDP_QUERY_PROG,
 	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
-	BPF_OFFLOAD_TRANSLATE,
 	BPF_OFFLOAD_DESTROY,
 	BPF_OFFLOAD_MAP_ALLOC,
 	BPF_OFFLOAD_MAP_FREE,
@@ -890,7 +889,7 @@ struct netdev_bpf {
 			/* flags with which program was installed */
 			u32 prog_flags;
 		};
-		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+		/* BPF_OFFLOAD_DESTROY */
 		struct {
 			struct bpf_prog *prog;
 		} offload;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 1f7ac00a494d..ae0167366c12 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -219,14 +219,14 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog)
 
 static int bpf_prog_offload_translate(struct bpf_prog *prog)
 {
-	struct netdev_bpf data = {};
-	int ret;
-
-	data.offload.prog = prog;
+	struct bpf_prog_offload *offload;
+	int ret = -ENODEV;
 
-	rtnl_lock();
-	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
-	rtnl_unlock();
+	down_read(&bpf_devs_lock);
+	offload = prog->aux->offload;
+	if (offload)
+		ret = offload->offdev->ops->translate(offload->netdev, prog);
+	up_read(&bpf_devs_lock);
 
 	return ret;
 }
-- 
cgit 


From eb9119471efbf730c8f830f706026b486eb701dd Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:30 +0000
Subject: bpf: pass destroy() as a callback and remove its ndo_bpf subcommand

As part of the transition from ndo_bpf() to callbacks attached to struct
bpf_offload_dev for some of the eBPF offload operations, move the
functions related to program destruction to the struct and remove the
subcommand that was used to call them through the NDO.

Remove function __bpf_offload_ndo(), which is no longer used.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c |  7 ++-----
 drivers/net/netdevsim/bpf.c                      |  4 +---
 include/linux/bpf.h                              |  1 +
 include/linux/netdevice.h                        |  5 -----
 kernel/bpf/offload.c                             | 24 +-----------------------
 5 files changed, 5 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 8653a2189c19..91085cc3c843 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -238,15 +238,13 @@ static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog)
 	return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog);
 }
 
-static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
+static void nfp_bpf_destroy(struct bpf_prog *prog)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
 
 	kvfree(nfp_prog->prog);
 	nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog);
 	nfp_prog_free(nfp_prog);
-
-	return 0;
 }
 
 /* Atomic engine requires values to be in big endian, we need to byte swap
@@ -418,8 +416,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
 int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
 {
 	switch (bpf->command) {
-	case BPF_OFFLOAD_DESTROY:
-		return nfp_bpf_destroy(nn, bpf->offload.prog);
 	case BPF_OFFLOAD_MAP_ALLOC:
 		return nfp_bpf_map_alloc(app->priv, bpf->offmap);
 	case BPF_OFFLOAD_MAP_FREE:
@@ -599,4 +595,5 @@ const struct bpf_prog_offload_ops nfp_bpf_dev_ops = {
 	.finalize	= nfp_bpf_finalize,
 	.prepare	= nfp_bpf_verifier_prep,
 	.translate	= nfp_bpf_translate,
+	.destroy	= nfp_bpf_destroy,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 30c2cd516d1c..33e3d54c3a0a 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -294,6 +294,7 @@ static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
 	.finalize	= nsim_bpf_finalize,
 	.prepare	= nsim_bpf_verifier_prep,
 	.translate	= nsim_bpf_translate,
+	.destroy	= nsim_bpf_destroy_prog,
 };
 
 static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
@@ -560,9 +561,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 	ASSERT_RTNL();
 
 	switch (bpf->command) {
-	case BPF_OFFLOAD_DESTROY:
-		nsim_bpf_destroy_prog(bpf->offload.prog);
-		return 0;
 	case XDP_QUERY_PROG:
 		return xdp_attachment_query(&ns->xdp, bpf);
 	case XDP_QUERY_PROG_HW:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d1eb3c8a3fa9..867d2801db64 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -270,6 +270,7 @@ struct bpf_prog_offload_ops {
 	int (*finalize)(struct bpf_verifier_env *env);
 	int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env);
 	int (*translate)(struct net_device *netdev, struct bpf_prog *prog);
+	void (*destroy)(struct bpf_prog *prog);
 };
 
 struct bpf_prog_offload {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 27499127e038..17d52a647fe5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -863,7 +863,6 @@ enum bpf_netdev_command {
 	XDP_QUERY_PROG,
 	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
-	BPF_OFFLOAD_DESTROY,
 	BPF_OFFLOAD_MAP_ALLOC,
 	BPF_OFFLOAD_MAP_FREE,
 	XDP_QUERY_XSK_UMEM,
@@ -889,10 +888,6 @@ struct netdev_bpf {
 			/* flags with which program was installed */
 			u32 prog_flags;
 		};
-		/* BPF_OFFLOAD_DESTROY */
-		struct {
-			struct bpf_prog *prog;
-		} offload;
 		/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
 		struct {
 			struct bpf_offloaded_map *offmap;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ae0167366c12..d665e75a0ac3 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -123,23 +123,6 @@ err_maybe_put:
 	return err;
 }
 
-static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
-			     struct netdev_bpf *data)
-{
-	struct bpf_prog_offload *offload = prog->aux->offload;
-	struct net_device *netdev;
-
-	ASSERT_RTNL();
-
-	if (!offload)
-		return -ENODEV;
-	netdev = offload->netdev;
-
-	data->command = cmd;
-
-	return netdev->netdev_ops->ndo_bpf(netdev, data);
-}
-
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 {
 	struct bpf_prog_offload *offload;
@@ -192,12 +175,9 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload = prog->aux->offload;
-	struct netdev_bpf data = {};
-
-	data.offload.prog = prog;
 
 	if (offload->dev_state)
-		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+		offload->offdev->ops->destroy(prog);
 
 	/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
 	bpf_prog_free_id(prog, true);
@@ -209,12 +189,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 
 void bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
-	rtnl_lock();
 	down_write(&bpf_devs_lock);
 	if (prog->aux->offload)
 		__bpf_prog_offload_destroy(prog);
 	up_write(&bpf_devs_lock);
-	rtnl_unlock();
 }
 
 static int bpf_prog_offload_translate(struct bpf_prog *prog)
-- 
cgit 


From a40a26322a83d4a26a99ad2616cbd77394c19587 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:31 +0000
Subject: bpf: pass prog instead of env to bpf_prog_offload_verifier_prep()

Function bpf_prog_offload_verifier_prep(), called from the kernel BPF
verifier to run a driver-specific callback for preparing for the
verification step for offloaded programs, takes a pointer to a struct
bpf_verifier_env object. However, no driver callback needs the whole
structure at this time: the two drivers supporting this, nfp and
netdevsim, only need a pointer to the struct bpf_prog instance held by
env.

Update the callback accordingly, on kernel side and in these two
drivers.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 3 +--
 drivers/net/netdevsim/bpf.c                      | 4 ++--
 include/linux/bpf.h                              | 2 +-
 include/linux/bpf_verifier.h                     | 2 +-
 kernel/bpf/offload.c                             | 6 +++---
 kernel/bpf/verifier.c                            | 2 +-
 6 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 91085cc3c843..e6b26d2f651d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -183,10 +183,9 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 }
 
 static int
-nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_verifier_env *env)
+nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_prog *prog)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
-	struct bpf_prog *prog = env->prog;
 	struct nfp_app *app = nn->app;
 	struct nfp_prog *nfp_prog;
 	int ret;
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 33e3d54c3a0a..560bdaf1c98b 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -259,14 +259,14 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog)
 }
 
 static int
-nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env)
+nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog)
 {
 	struct netdevsim *ns = netdev_priv(dev);
 
 	if (!ns->bpf_bind_accept)
 		return -EOPNOTSUPP;
 
-	return nsim_bpf_create_prog(ns, env->prog);
+	return nsim_bpf_create_prog(ns, prog);
 }
 
 static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 867d2801db64..888111350d0e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -268,7 +268,7 @@ struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
 	int (*finalize)(struct bpf_verifier_env *env);
-	int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env);
+	int (*prepare)(struct net_device *netdev, struct bpf_prog *prog);
 	int (*translate)(struct net_device *netdev, struct bpf_prog *prog);
 	void (*destroy)(struct bpf_prog *prog);
 };
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d93e89761a8b..11f5df1092d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -245,7 +245,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return cur_func(env)->regs;
 }
 
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog);
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx);
 int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d665e75a0ac3..397d206e184b 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -123,15 +123,15 @@ err_maybe_put:
 	return err;
 }
 
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload;
 	int ret = -ENODEV;
 
 	down_read(&bpf_devs_lock);
-	offload = env->prog->aux->offload;
+	offload = prog->aux->offload;
 	if (offload)
-		ret = offload->offdev->ops->prepare(offload->netdev, env);
+		ret = offload->offdev->ops->prepare(offload->netdev, prog);
 	offload->dev_state = !ret;
 	up_read(&bpf_devs_lock);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 75dab40b19a3..8d0977980cfa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6368,7 +6368,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		goto skip_full_check;
 
 	if (bpf_prog_is_dev_bound(env->prog->aux)) {
-		ret = bpf_prog_offload_verifier_prep(env);
+		ret = bpf_prog_offload_verifier_prep(env->prog);
 		if (ret)
 			goto skip_full_check;
 	}
-- 
cgit 


From 16a8cb5cffd0a2929ae97bc258d2d9c92a4e7f6d Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Fri, 9 Nov 2018 13:03:32 +0000
Subject: bpf: do not pass netdev to translate() and prepare() offload
 callbacks

The kernel functions to prepare verifier and translate for offloaded
program retrieve "offload" from "prog", and "netdev" from "offload".
Then both "prog" and "netdev" are passed to the callbacks.

Simplify this by letting the drivers retrieve the net device themselves
from the offload object attached to prog - if they need it at all. There
is currently no need to pass the netdev as an argument to those
functions.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 9 ++++-----
 drivers/net/netdevsim/bpf.c                      | 7 +++----
 include/linux/bpf.h                              | 4 ++--
 kernel/bpf/offload.c                             | 4 ++--
 4 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index e6b26d2f651d..f0283854fade 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -182,10 +182,9 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 	kfree(nfp_prog);
 }
 
-static int
-nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_prog *prog)
+static int nfp_bpf_verifier_prep(struct bpf_prog *prog)
 {
-	struct nfp_net *nn = netdev_priv(netdev);
+	struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev);
 	struct nfp_app *app = nn->app;
 	struct nfp_prog *nfp_prog;
 	int ret;
@@ -213,10 +212,10 @@ err_free:
 	return ret;
 }
 
-static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog)
+static int nfp_bpf_translate(struct bpf_prog *prog)
 {
+	struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev);
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-	struct nfp_net *nn = netdev_priv(netdev);
 	unsigned int max_instr;
 	int err;
 
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 560bdaf1c98b..6a5b7bd9a1f9 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -258,10 +258,9 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog)
 	return 0;
 }
 
-static int
-nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog)
+static int nsim_bpf_verifier_prep(struct bpf_prog *prog)
 {
-	struct netdevsim *ns = netdev_priv(dev);
+	struct netdevsim *ns = netdev_priv(prog->aux->offload->netdev);
 
 	if (!ns->bpf_bind_accept)
 		return -EOPNOTSUPP;
@@ -269,7 +268,7 @@ nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog)
 	return nsim_bpf_create_prog(ns, prog);
 }
 
-static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog)
+static int nsim_bpf_translate(struct bpf_prog *prog)
 {
 	struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 888111350d0e..987815152629 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -268,8 +268,8 @@ struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
 	int (*finalize)(struct bpf_verifier_env *env);
-	int (*prepare)(struct net_device *netdev, struct bpf_prog *prog);
-	int (*translate)(struct net_device *netdev, struct bpf_prog *prog);
+	int (*prepare)(struct bpf_prog *prog);
+	int (*translate)(struct bpf_prog *prog);
 	void (*destroy)(struct bpf_prog *prog);
 };
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 397d206e184b..52c5617e3716 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -131,7 +131,7 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
 	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
 	if (offload)
-		ret = offload->offdev->ops->prepare(offload->netdev, prog);
+		ret = offload->offdev->ops->prepare(prog);
 	offload->dev_state = !ret;
 	up_read(&bpf_devs_lock);
 
@@ -203,7 +203,7 @@ static int bpf_prog_offload_translate(struct bpf_prog *prog)
 	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
 	if (offload)
-		ret = offload->offdev->ops->translate(offload->netdev, prog);
+		ret = offload->offdev->ops->translate(prog);
 	up_read(&bpf_devs_lock);
 
 	return ret;
-- 
cgit 


From 46f53a65d2de3e1591636c22b626b09d8684fd71 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Sat, 10 Nov 2018 22:15:13 -0800
Subject: bpf: Allow narrow loads with offset > 0

Currently BPF verifier allows narrow loads for a context field only with
offset zero. E.g. if there is a __u32 field then only the following
loads are permitted:
  * off=0, size=1 (narrow);
  * off=0, size=2 (narrow);
  * off=0, size=4 (full).

On the other hand LLVM can generate a load with offset different than
zero that make sense from program logic point of view, but verifier
doesn't accept it.

E.g. tools/testing/selftests/bpf/sendmsg4_prog.c has code:

  #define DST_IP4			0xC0A801FEU /* 192.168.1.254 */
  ...
  	if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&

where ctx is struct bpf_sock_addr.

Some versions of LLVM can produce the following byte code for it:

       8:       71 12 07 00 00 00 00 00         r2 = *(u8 *)(r1 + 7)
       9:       67 02 00 00 18 00 00 00         r2 <<= 24
      10:       18 03 00 00 00 00 00 fe 00 00 00 00 00 00 00 00         r3 = 4261412864 ll
      12:       5d 32 07 00 00 00 00 00         if r2 != r3 goto +7 <LBB0_6>

where `*(u8 *)(r1 + 7)` means narrow load for ctx->user_ip4 with size=1
and offset=3 (7 - sizeof(ctx->user_family) = 3). This load is currently
rejected by verifier.

Verifier code that rejects such loads is in bpf_ctx_narrow_access_ok()
what means any is_valid_access implementation, that uses the function,
works this way, e.g. bpf_skb_is_valid_access() for __sk_buff or
sock_addr_is_valid_access() for bpf_sock_addr.

The patch makes such loads supported. Offset can be in [0; size_default)
but has to be multiple of load size. E.g. for __u32 field the following
loads are supported now:
  * off=0, size=1 (narrow);
  * off=1, size=1 (narrow);
  * off=2, size=1 (narrow);
  * off=3, size=1 (narrow);
  * off=0, size=2 (narrow);
  * off=2, size=2 (narrow);
  * off=0, size=4 (full).

Reported-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 16 +---------------
 kernel/bpf/verifier.c  | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index de629b706d1d..cc17f5f32fbb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -668,24 +668,10 @@ static inline u32 bpf_ctx_off_adjust_machine(u32 size)
 	return size;
 }
 
-static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
-					   u32 size_default)
-{
-	size_default = bpf_ctx_off_adjust_machine(size_default);
-	size_access  = bpf_ctx_off_adjust_machine(size_access);
-
-#ifdef __LITTLE_ENDIAN
-	return (off & (size_default - 1)) == 0;
-#else
-	return (off & (size_default - 1)) + size_access == size_default;
-#endif
-}
-
 static inline bool
 bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 {
-	return bpf_ctx_narrow_align_ok(off, size, size_default) &&
-	       size <= size_default && (size & (size - 1)) == 0;
+	return size <= size_default && (size & (size - 1)) == 0;
 }
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d0977980cfa..b5222aa61d54 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5718,10 +5718,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
+	u32 target_size, size_default, off;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
 	bool is_narrower_load;
-	u32 target_size;
 
 	if (ops->gen_prologue || env->seen_direct_write) {
 		if (!ops->gen_prologue) {
@@ -5814,9 +5814,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 * we will apply proper mask to the result.
 		 */
 		is_narrower_load = size < ctx_field_size;
+		size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+		off = insn->off;
 		if (is_narrower_load) {
-			u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
-			u32 off = insn->off;
 			u8 size_code;
 
 			if (type == BPF_WRITE) {
@@ -5844,12 +5844,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		if (is_narrower_load && size < target_size) {
-			if (ctx_field_size <= 4)
+			u8 shift = (off & (size_default - 1)) * 8;
+
+			if (ctx_field_size <= 4) {
+				if (shift)
+					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+									insn->dst_reg,
+									shift);
 				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
 								(1 << size * 8) - 1);
-			else
+			} else {
+				if (shift)
+					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+									insn->dst_reg,
+									shift);
 				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
 								(1 << size * 8) - 1);
+			}
 		}
 
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
-- 
cgit 


From 9cac83a57e99e4692315b7a91a81bab787961d97 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 11 Sep 2018 08:57:48 -0700
Subject: rcu: Stop expedited grace periods from relying on stop-machine

The CPU-selection code in sync_rcu_exp_select_cpus() disables preemption
to prevent the cpu_online_mask from changing.  However, this relies on
the stop-machine mechanism in the CPU-hotplug offline code, which is not
desirable (it would be good to someday remove the stop-machine mechanism).

This commit therefore instead uses the relevant leaf rcu_node structure's
->ffmask, which has a bit set for all CPUs that are fully functional.
A given CPU's bit is cleared very early during offline processing by
rcutree_offline_cpu() and set very late during online processing by
rcutree_online_cpu().  Therefore, if a CPU's bit is set in this mask, and
preemption is disabled, we have to be before the synchronize_sched() in
the CPU-hotplug offline code, which means that the CPU is guaranteed to be
workqueue-ready throughout the duration of the enclosing preempt_disable()
region of code.

This also has the side-effect of using WORK_CPU_UNBOUND if all the CPUs for
this leaf rcu_node structure are offline, which is an acceptable difference
in behavior.

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_exp.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8d18c1014e2b..e669ccf3751b 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -450,10 +450,12 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
 		}
 		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
 		preempt_disable();
-		cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask);
+		cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
 		/* If all offline, queue the work on an unbound CPU. */
-		if (unlikely(cpu > rnp->grphi))
+		if (unlikely(cpu > rnp->grphi - rnp->grplo))
 			cpu = WORK_CPU_UNBOUND;
+		else
+			cpu += rnp->grplo;
 		queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
 		preempt_enable();
 		rnp->exp_need_flush = true;
-- 
cgit 


From 92a801e5d5b7a893881c1676b15dd246727ccd16 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Mon, 5 Nov 2018 14:53:59 +0000
Subject: sched/fair: Mask UTIL_AVG_UNCHANGED usages

The _task_util_est() is mainly used to add/remove the task contribution
to/from the rq's estimated utilization at task enqueue/dequeue time.
In both cases we ensure the UTIL_AVG_UNCHANGED flag is set to keep
consistency between enqueue and dequeue time while still being
transparent to update_load_avg calls which will eventually reset the
flag.

Let's move the flag forcing within _task_util_est() itself so that we
can simplify calling code by hiding that estimated utilization
implementation detail into one of its internal functions.

This will affect also the "public" API task_util_est() but we know that
the flag will (eventually) impact just on the LSB of the estimated
utilization, thus it's certainly acceptable.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20181105145400.935-3-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a1ccf1ddd37a..28ee60cabba1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3604,7 +3604,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return max(ue.ewma, ue.enqueued);
+	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -3622,7 +3622,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 
 	/* Update root cfs_rq's estimated utilization */
 	enqueued  = cfs_rq->avg.util_est.enqueued;
-	enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+	enqueued += _task_util_est(p);
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
 }
 
@@ -3650,8 +3650,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 
 	/* Update root cfs_rq's estimated utilization */
 	ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-	ue.enqueued -= min_t(unsigned int, ue.enqueued,
-			     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+	ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
 	/*
@@ -6292,7 +6291,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 		 */
 		if (unlikely(task_on_rq_queued(p) || current == p)) {
 			estimated -= min_t(unsigned int, estimated,
-					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+					   _task_util_est(p));
 		}
 		util = max(util, estimated);
 	}
-- 
cgit 


From b5c0ce7bd1848892e2930f481828b6d7750231ed Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Mon, 5 Nov 2018 14:54:00 +0000
Subject: sched/fair: Add lsub_positive() and use it consistently

The following pattern:

   var -= min_t(typeof(var), var, val);

is used multiple times in fair.c.

The existing sub_positive() already captures that pattern, but it also
adds an explicit load-store to properly support lockless observations.
In other cases the pattern above is used to update local, and/or not
concurrently accessed, variables.

Let's add a simpler version of sub_positive(), targeted at local variables
updates, which gives the same readability benefits at calling sites,
without enforcing {READ,WRITE}_ONCE() barriers.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/lkml/20181031184527.GA3178@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28ee60cabba1..2cac9a469df4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2734,6 +2734,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	WRITE_ONCE(*ptr, res);					\
 } while (0)
 
+/*
+ * Remove and clamp on negative, from a local variable.
+ *
+ * A variant of sub_positive(), which does not use explicit load-store
+ * and is thus optimized for local variable updates.
+ */
+#define lsub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
+} while (0)
+
 #ifdef CONFIG_SMP
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4639,7 +4650,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
-		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+		lsub_positive(&cfs_b->runtime, runtime);
 	}
 
 	/*
@@ -4773,7 +4784,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
-		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+		lsub_positive(&cfs_b->runtime, runtime);
 	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
@@ -6240,7 +6251,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
 	/* Discount task's util from CPU's util */
-	util -= min_t(unsigned int, util, task_util(p));
+	lsub_positive(&util, task_util(p));
 
 	/*
 	 * Covered cases:
@@ -6289,10 +6300,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 		 * properly fix the execl regression and it helps in further
 		 * reducing the chances for the above race.
 		 */
-		if (unlikely(task_on_rq_queued(p) || current == p)) {
-			estimated -= min_t(unsigned int, estimated,
-					   _task_util_est(p));
-		}
+		if (unlikely(task_on_rq_queued(p) || current == p))
+			lsub_positive(&estimated, _task_util_est(p));
+
 		util = max(util, estimated);
 	}
 
-- 
cgit 


From 1da1843f9f0334e2428308945d396ffecc2acfe1 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 5 Nov 2018 16:51:55 +0530
Subject: sched/core: Create task_has_idle_policy() helper

We already have task_has_rt_policy() and task_has_dl_policy() helpers,
create task_has_idle_policy() as well and update sched core to start
using it.

While at it, use task_has_dl_policy() at one more place.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/ce3915d5b490fc81af926a3b6bfb775e7188e005.1541416894.git.viresh.kumar@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  4 ++--
 kernel/sched/debug.c |  2 +-
 kernel/sched/fair.c  | 10 +++++-----
 kernel/sched/sched.h |  5 +++++
 4 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02a20ef196a6..5afb868f7339 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -697,7 +697,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
-	if (idle_policy(p->policy)) {
+	if (task_has_idle_policy(p)) {
 		load->weight = scale_load(WEIGHT_IDLEPRIO);
 		load->inv_weight = WMULT_IDLEPRIO;
 		p->se.runnable_weight = load->weight;
@@ -4199,7 +4199,7 @@ recheck:
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (idle_policy(p->policy) && !idle_policy(policy)) {
+		if (task_has_idle_policy(p) && !idle_policy(policy)) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6383aa6a60ca..02bd5f969b21 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -974,7 +974,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 #endif
 	P(policy);
 	P(prio);
-	if (p->policy == SCHED_DEADLINE) {
+	if (task_has_dl_policy(p)) {
 		P(dl.runtime);
 		P(dl.deadline);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2cac9a469df4..d1f91e6efe51 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6529,7 +6529,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 
 static void set_last_buddy(struct sched_entity *se)
 {
-	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
 		return;
 
 	for_each_sched_entity(se) {
@@ -6541,7 +6541,7 @@ static void set_last_buddy(struct sched_entity *se)
 
 static void set_next_buddy(struct sched_entity *se)
 {
-	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
 		return;
 
 	for_each_sched_entity(se) {
@@ -6599,8 +6599,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/* Idle tasks are by definition preempted by non-idle tasks. */
-	if (unlikely(curr->policy == SCHED_IDLE) &&
-	    likely(p->policy != SCHED_IDLE))
+	if (unlikely(task_has_idle_policy(curr)) &&
+	    likely(!task_has_idle_policy(p)))
 		goto preempt;
 
 	/*
@@ -7021,7 +7021,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
-	if (unlikely(p->policy == SCHED_IDLE))
+	if (unlikely(task_has_idle_policy(p)))
 		return 0;
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577fc9aa8..b7a3147874e3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,11 @@ static inline bool valid_policy(int policy)
 		rt_policy(policy) || dl_policy(policy);
 }
 
+static inline int task_has_idle_policy(struct task_struct *p)
+{
+	return idle_policy(p->policy);
+}
+
 static inline int task_has_rt_policy(struct task_struct *p)
 {
 	return rt_policy(p->policy);
-- 
cgit 


From ed8885a14433aec04067463493051eaaeef3255f Mon Sep 17 00:00:00 2001
From: Muchun Song <smuchun@gmail.com>
Date: Sat, 10 Nov 2018 15:52:02 +0800
Subject: sched/fair: Make some variables static

The variables are local to the source and do not
need to be in global scope, so make them static.

Signed-off-by: Muchun Song <smuchun@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20181110075202.61172-1-smuchun@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1f91e6efe51..e30dea59d215 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -38,7 +38,7 @@
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_latency			= 6000000ULL;
-unsigned int normalized_sysctl_sched_latency		= 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
  *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity		= 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
+unsigned int sysctl_sched_min_granularity			= 750000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  *
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_wakeup_granularity		= 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
@@ -116,7 +116,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
  *
  * (default: ~20%)
  */
-unsigned int capacity_margin				= 1280;
+static unsigned int capacity_margin			= 1280;
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
-- 
cgit 


From 3e184501083c38fa091f640acb13af17a21fd228 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 6 Nov 2018 11:12:57 +0530
Subject: sched/core: Clean up the #ifdef block in add_nr_running()

There is no point in keeping the conditional statement of the #if block
outside of the #ifdef block, while all of its body is contained within
the #ifdef block.

Move the conditional statement under the #ifdef block as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/78cbd78a615d6f9fdcd3327f1ead68470f92593e.1541482935.git.viresh.kumar@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b7a3147874e3..e0e052a50fcd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1801,12 +1801,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 	rq->nr_running = prev_nr + count;
 
-	if (prev_nr < 2 && rq->nr_running >= 2) {
 #ifdef CONFIG_SMP
+	if (prev_nr < 2 && rq->nr_running >= 2) {
 		if (!READ_ONCE(rq->rd->overload))
 			WRITE_ONCE(rq->rd->overload, 1);
-#endif
 	}
+#endif
 
 	sched_update_tick_dependency(rq);
 }
-- 
cgit 


From 9f16d2e6241b2fc664523f17d74adda7489f496b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 17 Oct 2018 12:14:52 +0200
Subject: audit_tree: Remove mark->lock locking

Currently, audit_tree code uses mark->lock to protect against detaching
of mark from an inode. In most places it however also uses
mark->group->mark_mutex (as we need to atomically replace attached
marks) and this provides protection against mark detaching as well. So
just remove protection with mark->lock from audit tree code and replace
it with mark->group->mark_mutex protection in all the places. It
simplifies the code and gets rid of some ugly catches like calling
fsnotify_add_mark_locked() with mark->lock held (which cannot sleep only
because we hold a reference to another mark attached to the same inode).

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ea43181cde4a..1b55b1026a36 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -193,7 +193,7 @@ static inline struct list_head *chunk_hash(unsigned long key)
 	return chunk_hash_heads + n % HASH_SIZE;
 }
 
-/* hash_lock & entry->lock is held by caller */
+/* hash_lock & entry->group->mark_mutex is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
 	unsigned long key = chunk_to_key(chunk);
@@ -256,13 +256,11 @@ static void untag_chunk(struct node *p)
 		new = alloc_chunk(size);
 
 	mutex_lock(&entry->group->mark_mutex);
-	spin_lock(&entry->lock);
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
 	 * mark->connector->obj getting NULL.
 	 */
 	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
-		spin_unlock(&entry->lock);
 		mutex_unlock(&entry->group->mark_mutex);
 		if (new)
 			fsnotify_put_mark(&new->mark);
@@ -280,7 +278,6 @@ static void untag_chunk(struct node *p)
 		list_del_init(&p->list);
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
-		spin_unlock(&entry->lock);
 		mutex_unlock(&entry->group->mark_mutex);
 		fsnotify_destroy_mark(entry, audit_tree_group);
 		goto out;
@@ -323,7 +320,6 @@ static void untag_chunk(struct node *p)
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
 	spin_unlock(&hash_lock);
-	spin_unlock(&entry->lock);
 	mutex_unlock(&entry->group->mark_mutex);
 	fsnotify_destroy_mark(entry, audit_tree_group);
 	fsnotify_put_mark(&new->mark);	/* drop initial reference */
@@ -340,7 +336,6 @@ Fallback:
 	p->owner = NULL;
 	put_tree(owner);
 	spin_unlock(&hash_lock);
-	spin_unlock(&entry->lock);
 	mutex_unlock(&entry->group->mark_mutex);
 out:
 	fsnotify_put_mark(entry);
@@ -360,12 +355,12 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOSPC;
 	}
 
-	spin_lock(&entry->lock);
+	mutex_lock(&entry->group->mark_mutex);
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		spin_unlock(&entry->lock);
+		mutex_unlock(&entry->group->mark_mutex);
 		fsnotify_destroy_mark(entry, audit_tree_group);
 		fsnotify_put_mark(entry);
 		return 0;
@@ -380,7 +375,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
-	spin_unlock(&entry->lock);
+	mutex_unlock(&entry->group->mark_mutex);
 	fsnotify_put_mark(entry);	/* drop initial reference */
 	return 0;
 }
@@ -421,14 +416,12 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	chunk_entry = &chunk->mark;
 
 	mutex_lock(&old_entry->group->mark_mutex);
-	spin_lock(&old_entry->lock);
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
 	 * mark->connector->obj getting NULL.
 	 */
 	if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		/* old_entry is being shot, lets just lie */
-		spin_unlock(&old_entry->lock);
 		mutex_unlock(&old_entry->group->mark_mutex);
 		fsnotify_put_mark(old_entry);
 		fsnotify_put_mark(&chunk->mark);
@@ -437,23 +430,16 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
 				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
-		spin_unlock(&old_entry->lock);
 		mutex_unlock(&old_entry->group->mark_mutex);
 		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
 
-	/* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
-	spin_lock(&chunk_entry->lock);
 	spin_lock(&hash_lock);
-
-	/* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		spin_unlock(&chunk_entry->lock);
-		spin_unlock(&old_entry->lock);
 		mutex_unlock(&old_entry->group->mark_mutex);
 
 		fsnotify_destroy_mark(chunk_entry, audit_tree_group);
@@ -485,8 +471,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	spin_unlock(&hash_lock);
-	spin_unlock(&chunk_entry->lock);
-	spin_unlock(&old_entry->lock);
 	mutex_unlock(&old_entry->group->mark_mutex);
 	fsnotify_destroy_mark(old_entry, audit_tree_group);
 	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
-- 
cgit 


From a5789b07b35aa56569dff762bfc063303a9ccb95 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Fix possible spurious -ENOSPC error

When an inode is tagged with a tree, tag_chunk() checks whether there is
audit_tree_group mark attached to the inode and adds one if not. However
nothing protects another tag_chunk() to add the mark between we've
checked and try to add the fsnotify mark thus resulting in an error from
fsnotify_add_mark() and consequently an ENOSPC error from tag_chunk().

Fix the problem by holding mark_mutex over the whole check-insert code
sequence.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1b55b1026a36..8a74b468b666 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -342,25 +342,29 @@ out:
 	spin_lock(&hash_lock);
 }
 
+/* Call with group->mark_mutex held, releases it */
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
 {
 	struct fsnotify_mark *entry;
 	struct audit_chunk *chunk = alloc_chunk(1);
-	if (!chunk)
+
+	if (!chunk) {
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		return -ENOMEM;
+	}
 
 	entry = &chunk->mark;
-	if (fsnotify_add_inode_mark(entry, inode, 0)) {
+	if (fsnotify_add_inode_mark_locked(entry, inode, 0)) {
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(entry);
 		return -ENOSPC;
 	}
 
-	mutex_lock(&entry->group->mark_mutex);
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		mutex_unlock(&entry->group->mark_mutex);
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_destroy_mark(entry, audit_tree_group);
 		fsnotify_put_mark(entry);
 		return 0;
@@ -375,7 +379,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&entry->group->mark_mutex);
+	mutex_unlock(&audit_tree_group->mark_mutex);
 	fsnotify_put_mark(entry);	/* drop initial reference */
 	return 0;
 }
@@ -389,6 +393,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	struct node *p;
 	int n;
 
+	mutex_lock(&audit_tree_group->mark_mutex);
 	old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
 				       audit_tree_group);
 	if (!old_entry)
@@ -401,6 +406,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
+			mutex_unlock(&audit_tree_group->mark_mutex);
 			fsnotify_put_mark(old_entry);
 			return 0;
 		}
@@ -409,20 +415,20 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(old_entry);
 		return -ENOMEM;
 	}
 
 	chunk_entry = &chunk->mark;
 
-	mutex_lock(&old_entry->group->mark_mutex);
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
 	 * mark->connector->obj getting NULL.
 	 */
 	if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		/* old_entry is being shot, lets just lie */
-		mutex_unlock(&old_entry->group->mark_mutex);
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(old_entry);
 		fsnotify_put_mark(&chunk->mark);
 		return -ENOENT;
@@ -430,7 +436,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
 				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
-		mutex_unlock(&old_entry->group->mark_mutex);
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
@@ -440,7 +446,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		mutex_unlock(&old_entry->group->mark_mutex);
+		mutex_unlock(&audit_tree_group->mark_mutex);
 
 		fsnotify_destroy_mark(chunk_entry, audit_tree_group);
 
@@ -471,7 +477,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	spin_unlock(&hash_lock);
-	mutex_unlock(&old_entry->group->mark_mutex);
+	mutex_unlock(&audit_tree_group->mark_mutex);
 	fsnotify_destroy_mark(old_entry, audit_tree_group);
 	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
-- 
cgit 


From b1e4603b92d8aef8776e5673dc13fedb68d32ea4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Fix possible tagging failures

Audit tree code is replacing marks attached to inodes in non-atomic way.
Thus fsnotify_find_mark() in tag_chunk() may find a mark that belongs to
a chunk that is no longer valid one and will soon be destroyed. Tags
added to such chunk will be simply lost.

Fix the problem by making sure old mark is marked as going away (through
fsnotify_detach_mark()) before dropping mark_mutex and thus in an atomic
way wrt tag_chunk(). Note that this does not fix the problem completely
as if tag_chunk() finds a mark that is going away, it fails with
-ENOENT. But at least the failure is not silent and currently there's no
way to search for another fsnotify mark attached to the inode. We'll fix
this problem in later patch.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8a74b468b666..c194dbd53753 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -278,8 +278,9 @@ static void untag_chunk(struct node *p)
 		list_del_init(&p->list);
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
+		fsnotify_detach_mark(entry);
 		mutex_unlock(&entry->group->mark_mutex);
-		fsnotify_destroy_mark(entry, audit_tree_group);
+		fsnotify_free_mark(entry);
 		goto out;
 	}
 
@@ -320,8 +321,9 @@ static void untag_chunk(struct node *p)
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
 	spin_unlock(&hash_lock);
+	fsnotify_detach_mark(entry);
 	mutex_unlock(&entry->group->mark_mutex);
-	fsnotify_destroy_mark(entry, audit_tree_group);
+	fsnotify_free_mark(entry);
 	fsnotify_put_mark(&new->mark);	/* drop initial reference */
 	goto out;
 
@@ -364,8 +366,9 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
+		fsnotify_detach_mark(entry);
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_destroy_mark(entry, audit_tree_group);
+		fsnotify_free_mark(entry);
 		fsnotify_put_mark(entry);
 		return 0;
 	}
@@ -446,10 +449,9 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
+		fsnotify_detach_mark(chunk_entry);
 		mutex_unlock(&audit_tree_group->mark_mutex);
-
-		fsnotify_destroy_mark(chunk_entry, audit_tree_group);
-
+		fsnotify_free_mark(chunk_entry);
 		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return 0;
@@ -477,8 +479,9 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	spin_unlock(&hash_lock);
+	fsnotify_detach_mark(old_entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-	fsnotify_destroy_mark(old_entry, audit_tree_group);
+	fsnotify_free_mark(old_entry);
 	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
 	return 0;
-- 
cgit 


From 8d20d6e9301d7b3777d66d47dd5b89acd645cd39 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Embed key into chunk

Currently chunk hash key (which is in fact pointer to the inode) is
derived as chunk->mark.conn->obj. It is tricky to make this dereference
reliable for hash table lookups only under RCU as mark can get detached
from the connector and connector gets freed independently of the
running lookup. Thus there is a possible use after free / NULL ptr
dereference issue:

CPU1					CPU2
					untag_chunk()
					  ...
audit_tree_lookup()
  list_for_each_entry_rcu(p, list, hash) {
					  list_del_rcu(&chunk->hash);
					  fsnotify_destroy_mark(entry);
					  fsnotify_put_mark(entry)
    chunk_to_key(p)
      if (!chunk->mark.connector)
					    ...
					    hlist_del_init_rcu(&mark->obj_list);
					    if (hlist_empty(&conn->list)) {
					      inode = fsnotify_detach_connector_from_object(conn);
					    mark->connector = NULL;
					    ...
					    frees connector from workqueue
      chunk->mark.connector->obj

This race is probably impossible to hit in practice as the race window
on CPU1 is very narrow and CPU2 has a lot of code to execute. Still it's
better to have this fixed. Since the inode the chunk is attached to is
constant during chunk's lifetime it is easy to cache the key in the
chunk itself and thus avoid these issues.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index c194dbd53753..bac5dd90c629 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_tree {
 
 struct audit_chunk {
 	struct list_head hash;
+	unsigned long key;
 	struct fsnotify_mark mark;
 	struct list_head trees;		/* with root here */
 	int dead;
@@ -172,21 +173,6 @@ static unsigned long inode_to_key(const struct inode *inode)
 	return (unsigned long)&inode->i_fsnotify_marks;
 }
 
-/*
- * Function to return search key in our hash from chunk. Key 0 is special and
- * should never be present in the hash.
- */
-static unsigned long chunk_to_key(struct audit_chunk *chunk)
-{
-	/*
-	 * We have a reference to the mark so it should be attached to a
-	 * connector.
-	 */
-	if (WARN_ON_ONCE(!chunk->mark.connector))
-		return 0;
-	return (unsigned long)chunk->mark.connector->obj;
-}
-
 static inline struct list_head *chunk_hash(unsigned long key)
 {
 	unsigned long n = key / L1_CACHE_BYTES;
@@ -196,12 +182,12 @@ static inline struct list_head *chunk_hash(unsigned long key)
 /* hash_lock & entry->group->mark_mutex is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-	unsigned long key = chunk_to_key(chunk);
 	struct list_head *list;
 
 	if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
 		return;
-	list = chunk_hash(key);
+	WARN_ON_ONCE(!chunk->key);
+	list = chunk_hash(chunk->key);
 	list_add_rcu(&chunk->hash, list);
 }
 
@@ -213,7 +199,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 	struct audit_chunk *p;
 
 	list_for_each_entry_rcu(p, list, hash) {
-		if (chunk_to_key(p) == key) {
+		if (p->key == key) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -295,6 +281,7 @@ static void untag_chunk(struct node *p)
 
 	chunk->dead = 1;
 	spin_lock(&hash_lock);
+	new->key = chunk->key;
 	list_replace_init(&chunk->trees, &new->trees);
 	if (owner->root == chunk) {
 		list_del_init(&owner->same_root);
@@ -380,6 +367,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		tree->root = chunk;
 		list_add(&tree->same_root, &chunk->trees);
 	}
+	chunk->key = inode_to_key(inode);
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&audit_tree_group->mark_mutex);
@@ -456,6 +444,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		fsnotify_put_mark(old_entry);
 		return 0;
 	}
+	chunk->key = old->key;
 	list_replace_init(&old->trees, &chunk->trees);
 	for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
 		struct audit_tree *s = old->owners[n].owner;
@@ -654,7 +643,7 @@ void audit_trim_trees(void)
 			/* this could be NULL if the watch is dying else where... */
 			node->index |= 1U<<31;
 			if (iterate_mounts(compare_root,
-					   (void *)chunk_to_key(chunk),
+					   (void *)(chunk->key),
 					   root_mnt))
 				node->index &= ~(1U<<31);
 		}
-- 
cgit 


From 1635e5722350597b6a149bdb131358fcd7e34906 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Make hash table insertion safe against concurrent lookups

Currently, the audit tree code does not make sure that when a chunk is
inserted into the hash table, it is fully initialized. So in theory a
user of RCU lookup could see uninitialized structure in the hash table
and crash. Add appropriate barriers between initialization of the
structure and its insertion into hash table.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index bac5dd90c629..307749d6773c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -186,6 +186,12 @@ static void insert_hash(struct audit_chunk *chunk)
 
 	if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
 		return;
+	/*
+	 * Make sure chunk is fully initialized before making it visible in the
+	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
+	 * audit_tree_lookup().
+	 */
+	smp_wmb();
 	WARN_ON_ONCE(!chunk->key);
 	list = chunk_hash(chunk->key);
 	list_add_rcu(&chunk->hash, list);
@@ -199,7 +205,11 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 	struct audit_chunk *p;
 
 	list_for_each_entry_rcu(p, list, hash) {
-		if (p->key == key) {
+		/*
+		 * We use a data dependency barrier in READ_ONCE() to make sure
+		 * the chunk we see is fully initialized.
+		 */
+		if (READ_ONCE(p->key) == key) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -304,9 +314,15 @@ static void untag_chunk(struct node *p)
 		list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
 	}
 
-	list_replace_rcu(&chunk->hash, &new->hash);
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
+	/*
+	 * Make sure chunk is fully initialized before making it visible in the
+	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
+	 * audit_tree_lookup().
+	 */
+	smp_wmb();
+	list_replace_rcu(&chunk->hash, &new->hash);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(entry);
 	mutex_unlock(&entry->group->mark_mutex);
@@ -368,6 +384,10 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	chunk->key = inode_to_key(inode);
+	/*
+	 * Inserting into the hash table has to go last as once we do that RCU
+	 * readers can see the chunk.
+	 */
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&audit_tree_group->mark_mutex);
@@ -459,7 +479,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	p->owner = tree;
 	get_tree(tree);
 	list_add(&p->list, &tree->chunks);
-	list_replace_rcu(&old->hash, &chunk->hash);
 	list_for_each_entry(owner, &chunk->trees, same_root)
 		owner->root = chunk;
 	old->dead = 1;
@@ -467,6 +486,13 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		tree->root = chunk;
 		list_add(&tree->same_root, &chunk->trees);
 	}
+	/*
+	 * Make sure chunk is fully initialized before making it visible in the
+	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
+	 * audit_tree_lookup().
+	 */
+	smp_wmb();
+	list_replace_rcu(&old->hash, &chunk->hash);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(old_entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-- 
cgit 


From d31b326d3ce7b1ff2ec36470dfcccb14a6c3e04e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Factor out chunk replacement code

Chunk replacement code is very similar for the cases where we grow or
shrink chunk. Factor the code out into a common helper function.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 86 +++++++++++++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 307749d6773c..d8f6cfa0005b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -235,6 +235,38 @@ static struct audit_chunk *find_chunk(struct node *p)
 	return container_of(p, struct audit_chunk, owners[0]);
 }
 
+static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old,
+			  struct node *skip)
+{
+	struct audit_tree *owner;
+	int i, j;
+
+	new->key = old->key;
+	list_splice_init(&old->trees, &new->trees);
+	list_for_each_entry(owner, &new->trees, same_root)
+		owner->root = new;
+	for (i = j = 0; j < old->count; i++, j++) {
+		if (&old->owners[j] == skip) {
+			i--;
+			continue;
+		}
+		owner = old->owners[j].owner;
+		new->owners[i].owner = owner;
+		new->owners[i].index = old->owners[j].index - j + i;
+		if (!owner) /* result of earlier fallback */
+			continue;
+		get_tree(owner);
+		list_replace_init(&old->owners[j].list, &new->owners[i].list);
+	}
+	/*
+	 * Make sure chunk is fully initialized before making it visible in the
+	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
+	 * audit_tree_lookup().
+	 */
+	smp_wmb();
+	list_replace_rcu(&old->hash, &new->hash);
+}
+
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
@@ -242,7 +274,6 @@ static void untag_chunk(struct node *p)
 	struct audit_chunk *new = NULL;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
-	int i, j;
 
 	fsnotify_get_mark(entry);
 
@@ -291,38 +322,16 @@ static void untag_chunk(struct node *p)
 
 	chunk->dead = 1;
 	spin_lock(&hash_lock);
-	new->key = chunk->key;
-	list_replace_init(&chunk->trees, &new->trees);
 	if (owner->root == chunk) {
 		list_del_init(&owner->same_root);
 		owner->root = NULL;
 	}
-
-	for (i = j = 0; j <= size; i++, j++) {
-		struct audit_tree *s;
-		if (&chunk->owners[j] == p) {
-			list_del_init(&p->list);
-			i--;
-			continue;
-		}
-		s = chunk->owners[j].owner;
-		new->owners[i].owner = s;
-		new->owners[i].index = chunk->owners[j].index - j + i;
-		if (!s) /* result of earlier fallback */
-			continue;
-		get_tree(s);
-		list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
-	}
-
-	list_for_each_entry(owner, &new->trees, same_root)
-		owner->root = new;
+	list_del_init(&p->list);
 	/*
-	 * Make sure chunk is fully initialized before making it visible in the
-	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
-	 * audit_tree_lookup().
+	 * This has to go last when updating chunk as once replace_chunk() is
+	 * called, new RCU readers can see the new chunk.
 	 */
-	smp_wmb();
-	list_replace_rcu(&chunk->hash, &new->hash);
+	replace_chunk(new, chunk, p);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(entry);
 	mutex_unlock(&entry->group->mark_mutex);
@@ -399,7 +408,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
 	struct fsnotify_mark *old_entry, *chunk_entry;
-	struct audit_tree *owner;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
 	int n;
@@ -464,35 +472,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		fsnotify_put_mark(old_entry);
 		return 0;
 	}
-	chunk->key = old->key;
-	list_replace_init(&old->trees, &chunk->trees);
-	for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
-		struct audit_tree *s = old->owners[n].owner;
-		p->owner = s;
-		p->index = old->owners[n].index;
-		if (!s) /* result of fallback in untag */
-			continue;
-		get_tree(s);
-		list_replace_init(&old->owners[n].list, &p->list);
-	}
+	p = &chunk->owners[chunk->count - 1];
 	p->index = (chunk->count - 1) | (1U<<31);
 	p->owner = tree;
 	get_tree(tree);
 	list_add(&p->list, &tree->chunks);
-	list_for_each_entry(owner, &chunk->trees, same_root)
-		owner->root = chunk;
 	old->dead = 1;
 	if (!tree->root) {
 		tree->root = chunk;
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	/*
-	 * Make sure chunk is fully initialized before making it visible in the
-	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
-	 * audit_tree_lookup().
+	 * This has to go last when updating chunk as once replace_chunk() is
+	 * called, new RCU readers can see the new chunk.
 	 */
-	smp_wmb();
-	list_replace_rcu(&old->hash, &chunk->hash);
+	replace_chunk(chunk, old, NULL);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(old_entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-- 
cgit 


From 8cd0feb5234ccda3c15de35b40c8010a406dfc03 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:48 -0500
Subject: audit: Remove pointless check in insert_hash()

The audit_tree_group->mark_mutex is held all the time while we create
the fsnotify mark, add it to the inode, and insert chunk into the hash.
Hence mark cannot get detached during this time and so the check whether
the mark is attached in insert_hash() is pointless.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d8f6cfa0005b..d150514ff15e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -184,8 +184,6 @@ static void insert_hash(struct audit_chunk *chunk)
 {
 	struct list_head *list;
 
-	if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
-		return;
 	/*
 	 * Make sure chunk is fully initialized before making it visible in the
 	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
-- 
cgit 


From a8375713fb1ff28ec718b601895958f1db775774 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:49 -0500
Subject: audit: Provide helper for dropping mark's chunk reference

Provide a helper function audit_mark_put_chunk() for dropping mark's
reference (which has to happen only after RCU grace period expires).
Currently that happens only from a single place but in later patches we
introduce more callers.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d150514ff15e..35c031ebcc12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -132,10 +132,20 @@ static void __put_chunk(struct rcu_head *rcu)
 	audit_put_chunk(chunk);
 }
 
+/*
+ * Drop reference to the chunk that was held by the mark. This is the reference
+ * that gets dropped after we've removed the chunk from the hash table and we
+ * use it to make sure chunk cannot be freed before RCU grace period expires.
+ */
+static void audit_mark_put_chunk(struct audit_chunk *chunk)
+{
+	call_rcu(&chunk->head, __put_chunk);
+}
+
 static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
 {
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
-	call_rcu(&chunk->head, __put_chunk);
+	audit_mark_put_chunk(chunk);
 }
 
 static struct audit_chunk *alloc_chunk(int count)
-- 
cgit 


From 5f5161300d7bd530e062428ac694824832960cf5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:49 -0500
Subject: audit: Allocate fsnotify mark independently of chunk

Allocate fsnotify mark independently instead of embedding it inside
chunk. This will allow us to just replace chunk attached to mark when
growing / shrinking chunk instead of replacing mark attached to inode
which is a more complex operation.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 64 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 35c031ebcc12..c98ab2d68a1c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -25,7 +25,7 @@ struct audit_tree {
 struct audit_chunk {
 	struct list_head hash;
 	unsigned long key;
-	struct fsnotify_mark mark;
+	struct fsnotify_mark *mark;
 	struct list_head trees;		/* with root here */
 	int dead;
 	int count;
@@ -38,6 +38,11 @@ struct audit_chunk {
 	} owners[];
 };
 
+struct audit_tree_mark {
+	struct fsnotify_mark mark;
+	struct audit_chunk *chunk;
+};
+
 static LIST_HEAD(tree_list);
 static LIST_HEAD(prune_list);
 static struct task_struct *prune_thread;
@@ -73,6 +78,7 @@ static struct task_struct *prune_thread;
  */
 
 static struct fsnotify_group *audit_tree_group;
+static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
 
 static struct audit_tree *alloc_tree(const char *s)
 {
@@ -142,10 +148,33 @@ static void audit_mark_put_chunk(struct audit_chunk *chunk)
 	call_rcu(&chunk->head, __put_chunk);
 }
 
+static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *entry)
+{
+	return container_of(entry, struct audit_tree_mark, mark);
+}
+
+static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
+{
+	return audit_mark(mark)->chunk;
+}
+
 static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
 {
-	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+	struct audit_chunk *chunk = mark_chunk(entry);
 	audit_mark_put_chunk(chunk);
+	kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry));
+}
+
+static struct fsnotify_mark *alloc_mark(void)
+{
+	struct audit_tree_mark *amark;
+
+	amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL);
+	if (!amark)
+		return NULL;
+	fsnotify_init_mark(&amark->mark, audit_tree_group);
+	amark->mark.mask = FS_IN_IGNORED;
+	return &amark->mark;
 }
 
 static struct audit_chunk *alloc_chunk(int count)
@@ -159,6 +188,13 @@ static struct audit_chunk *alloc_chunk(int count)
 	if (!chunk)
 		return NULL;
 
+	chunk->mark = alloc_mark();
+	if (!chunk->mark) {
+		kfree(chunk);
+		return NULL;
+	}
+	audit_mark(chunk->mark)->chunk = chunk;
+
 	INIT_LIST_HEAD(&chunk->hash);
 	INIT_LIST_HEAD(&chunk->trees);
 	chunk->count = count;
@@ -167,8 +203,6 @@ static struct audit_chunk *alloc_chunk(int count)
 		INIT_LIST_HEAD(&chunk->owners[i].list);
 		chunk->owners[i].index = i;
 	}
-	fsnotify_init_mark(&chunk->mark, audit_tree_group);
-	chunk->mark.mask = FS_IN_IGNORED;
 	return chunk;
 }
 
@@ -278,7 +312,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old,
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
-	struct fsnotify_mark *entry = &chunk->mark;
+	struct fsnotify_mark *entry = chunk->mark;
 	struct audit_chunk *new = NULL;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
@@ -298,7 +332,7 @@ static void untag_chunk(struct node *p)
 	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		mutex_unlock(&entry->group->mark_mutex);
 		if (new)
-			fsnotify_put_mark(&new->mark);
+			fsnotify_put_mark(new->mark);
 		goto out;
 	}
 
@@ -322,9 +356,9 @@ static void untag_chunk(struct node *p)
 	if (!new)
 		goto Fallback;
 
-	if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj,
+	if (fsnotify_add_mark_locked(new->mark, entry->connector->obj,
 				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
-		fsnotify_put_mark(&new->mark);
+		fsnotify_put_mark(new->mark);
 		goto Fallback;
 	}
 
@@ -344,7 +378,7 @@ static void untag_chunk(struct node *p)
 	fsnotify_detach_mark(entry);
 	mutex_unlock(&entry->group->mark_mutex);
 	fsnotify_free_mark(entry);
-	fsnotify_put_mark(&new->mark);	/* drop initial reference */
+	fsnotify_put_mark(new->mark);	/* drop initial reference */
 	goto out;
 
 Fallback:
@@ -375,7 +409,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 	}
 
-	entry = &chunk->mark;
+	entry = chunk->mark;
 	if (fsnotify_add_inode_mark_locked(entry, inode, 0)) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(entry);
@@ -426,7 +460,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	if (!old_entry)
 		return create_chunk(inode, tree);
 
-	old = container_of(old_entry, struct audit_chunk, mark);
+	old = mark_chunk(old_entry);
 
 	/* are we already there? */
 	spin_lock(&hash_lock);
@@ -447,7 +481,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 	}
 
-	chunk_entry = &chunk->mark;
+	chunk_entry = chunk->mark;
 
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
@@ -457,7 +491,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		/* old_entry is being shot, lets just lie */
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(old_entry);
-		fsnotify_put_mark(&chunk->mark);
+		fsnotify_put_mark(chunk->mark);
 		return -ENOENT;
 	}
 
@@ -1011,7 +1045,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 
 static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
 {
-	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+	struct audit_chunk *chunk = mark_chunk(entry);
 
 	evict_chunk(chunk);
 
@@ -1032,6 +1066,8 @@ static int __init audit_tree_init(void)
 {
 	int i;
 
+	audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
+
 	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
 	if (IS_ERR(audit_tree_group))
 		audit_panic("cannot initialize fsnotify group for rectree watches");
-- 
cgit 


From 49a4ee7d98dbe34cfed90b930664c8a9fa73b24c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:49 -0500
Subject: audit: Guarantee forward progress of chunk untagging

When removing chunk from a tree, we do shrink the chunk. This can fail
for various reasons (due to races, ENOMEM, etc.) and in some cases we
just bail from untag_chunk() relying on someone else to cleanup.
Although this currently works, later we will need to add new failure
situation which would break. Also this simplifies the code and will
allow us to make locking around untag_chunk() less awkward.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index c98ab2d68a1c..ca2b6baff7aa 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -309,16 +309,28 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old,
 	list_replace_rcu(&old->hash, &new->hash);
 }
 
+static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+{
+	struct audit_tree *owner = p->owner;
+
+	if (owner->root == chunk) {
+		list_del_init(&owner->same_root);
+		owner->root = NULL;
+	}
+	list_del_init(&p->list);
+	p->owner = NULL;
+	put_tree(owner);
+}
+
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
 	struct fsnotify_mark *entry = chunk->mark;
 	struct audit_chunk *new = NULL;
-	struct audit_tree *owner;
 	int size = chunk->count - 1;
 
+	remove_chunk_node(chunk, p);
 	fsnotify_get_mark(entry);
-
 	spin_unlock(&hash_lock);
 
 	if (size)
@@ -336,15 +348,10 @@ static void untag_chunk(struct node *p)
 		goto out;
 	}
 
-	owner = p->owner;
-
 	if (!size) {
 		chunk->dead = 1;
 		spin_lock(&hash_lock);
 		list_del_init(&chunk->trees);
-		if (owner->root == chunk)
-			owner->root = NULL;
-		list_del_init(&p->list);
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
 		fsnotify_detach_mark(entry);
@@ -354,21 +361,16 @@ static void untag_chunk(struct node *p)
 	}
 
 	if (!new)
-		goto Fallback;
+		goto out_mutex;
 
 	if (fsnotify_add_mark_locked(new->mark, entry->connector->obj,
 				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
 		fsnotify_put_mark(new->mark);
-		goto Fallback;
+		goto out_mutex;
 	}
 
 	chunk->dead = 1;
 	spin_lock(&hash_lock);
-	if (owner->root == chunk) {
-		list_del_init(&owner->same_root);
-		owner->root = NULL;
-	}
-	list_del_init(&p->list);
 	/*
 	 * This has to go last when updating chunk as once replace_chunk() is
 	 * called, new RCU readers can see the new chunk.
@@ -381,17 +383,7 @@ static void untag_chunk(struct node *p)
 	fsnotify_put_mark(new->mark);	/* drop initial reference */
 	goto out;
 
-Fallback:
-	// do the best we can
-	spin_lock(&hash_lock);
-	if (owner->root == chunk) {
-		list_del_init(&owner->same_root);
-		owner->root = NULL;
-	}
-	list_del_init(&p->list);
-	p->owner = NULL;
-	put_tree(owner);
-	spin_unlock(&hash_lock);
+out_mutex:
 	mutex_unlock(&entry->group->mark_mutex);
 out:
 	fsnotify_put_mark(entry);
-- 
cgit 


From c22fcde775dcc9f46d73d694061441efdc7bdaad Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:49 -0500
Subject: audit: Drop all unused chunk nodes during deletion

When deleting chunk from a tree, drop all unused nodes in a chunk
instead of just the one used by the tree. This gets rid of possibly
lingering unused nodes (created due to fallback path in untag_chunk())
and also removes some special cases and will allow us to simplify
locking in untag_chunk().

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ca2b6baff7aa..145e8c92dd31 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,8 +277,7 @@ static struct audit_chunk *find_chunk(struct node *p)
 	return container_of(p, struct audit_chunk, owners[0]);
 }
 
-static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old,
-			  struct node *skip)
+static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
 {
 	struct audit_tree *owner;
 	int i, j;
@@ -288,7 +287,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old,
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
 	for (i = j = 0; j < old->count; i++, j++) {
-		if (&old->owners[j] == skip) {
+		if (!old->owners[j].owner) {
 			i--;
 			continue;
 		}
@@ -322,20 +321,28 @@ static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
 	put_tree(owner);
 }
 
+static int chunk_count_trees(struct audit_chunk *chunk)
+{
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < chunk->count; i++)
+		if (chunk->owners[i].owner)
+			ret++;
+	return ret;
+}
+
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
 	struct fsnotify_mark *entry = chunk->mark;
 	struct audit_chunk *new = NULL;
-	int size = chunk->count - 1;
+	int size;
 
 	remove_chunk_node(chunk, p);
 	fsnotify_get_mark(entry);
 	spin_unlock(&hash_lock);
 
-	if (size)
-		new = alloc_chunk(size);
-
 	mutex_lock(&entry->group->mark_mutex);
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
@@ -348,6 +355,7 @@ static void untag_chunk(struct node *p)
 		goto out;
 	}
 
+	size = chunk_count_trees(chunk);
 	if (!size) {
 		chunk->dead = 1;
 		spin_lock(&hash_lock);
@@ -360,6 +368,7 @@ static void untag_chunk(struct node *p)
 		goto out;
 	}
 
+	new = alloc_chunk(size);
 	if (!new)
 		goto out_mutex;
 
@@ -375,7 +384,7 @@ static void untag_chunk(struct node *p)
 	 * This has to go last when updating chunk as once replace_chunk() is
 	 * called, new RCU readers can see the new chunk.
 	 */
-	replace_chunk(new, chunk, p);
+	replace_chunk(new, chunk);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(entry);
 	mutex_unlock(&entry->group->mark_mutex);
@@ -520,7 +529,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	 * This has to go last when updating chunk as once replace_chunk() is
 	 * called, new RCU readers can see the new chunk.
 	 */
-	replace_chunk(chunk, old, NULL);
+	replace_chunk(chunk, old);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(old_entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-- 
cgit 


From 8432c70062978d9a57bde6715496d585ec520c3e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:54:56 -0500
Subject: audit: Simplify locking around untag_chunk()

untag_chunk() has to be called with hash_lock, it drops it and
reacquires it when returning. The unlocking of hash_lock is thus hidden
from the callers of untag_chunk() with is rather error prone. Reorganize
the code so that untag_chunk() is called without hash_lock, only with
mark reference preventing the chunk from going away.

Since this requires some more code in the caller of untag_chunk() to
assure forward progress, factor out loop pruning tree from all chunks
into a common helper function.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 77 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 145e8c92dd31..82b27da7031c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -332,28 +332,18 @@ static int chunk_count_trees(struct audit_chunk *chunk)
 	return ret;
 }
 
-static void untag_chunk(struct node *p)
+static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 {
-	struct audit_chunk *chunk = find_chunk(p);
-	struct fsnotify_mark *entry = chunk->mark;
-	struct audit_chunk *new = NULL;
+	struct audit_chunk *new;
 	int size;
 
-	remove_chunk_node(chunk, p);
-	fsnotify_get_mark(entry);
-	spin_unlock(&hash_lock);
-
-	mutex_lock(&entry->group->mark_mutex);
+	mutex_lock(&audit_tree_group->mark_mutex);
 	/*
 	 * mark_mutex protects mark from getting detached and thus also from
 	 * mark->connector->obj getting NULL.
 	 */
-	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
-		mutex_unlock(&entry->group->mark_mutex);
-		if (new)
-			fsnotify_put_mark(new->mark);
-		goto out;
-	}
+	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
+		goto out_mutex;
 
 	size = chunk_count_trees(chunk);
 	if (!size) {
@@ -363,9 +353,9 @@ static void untag_chunk(struct node *p)
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
 		fsnotify_detach_mark(entry);
-		mutex_unlock(&entry->group->mark_mutex);
+		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_free_mark(entry);
-		goto out;
+		return;
 	}
 
 	new = alloc_chunk(size);
@@ -387,16 +377,13 @@ static void untag_chunk(struct node *p)
 	replace_chunk(new, chunk);
 	spin_unlock(&hash_lock);
 	fsnotify_detach_mark(entry);
-	mutex_unlock(&entry->group->mark_mutex);
+	mutex_unlock(&audit_tree_group->mark_mutex);
 	fsnotify_free_mark(entry);
 	fsnotify_put_mark(new->mark);	/* drop initial reference */
-	goto out;
+	return;
 
 out_mutex:
-	mutex_unlock(&entry->group->mark_mutex);
-out:
-	fsnotify_put_mark(entry);
-	spin_lock(&hash_lock);
+	mutex_unlock(&audit_tree_group->mark_mutex);
 }
 
 /* Call with group->mark_mutex held, releases it */
@@ -579,22 +566,45 @@ static void kill_rules(struct audit_tree *tree)
 }
 
 /*
- * finish killing struct audit_tree
+ * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged
+ * chunks. The function expects tagged chunks are all at the beginning of the
+ * chunks list.
  */
-static void prune_one(struct audit_tree *victim)
+static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
 {
 	spin_lock(&hash_lock);
 	while (!list_empty(&victim->chunks)) {
 		struct node *p;
+		struct audit_chunk *chunk;
+		struct fsnotify_mark *mark;
+
+		p = list_first_entry(&victim->chunks, struct node, list);
+		/* have we run out of marked? */
+		if (tagged && !(p->index & (1U<<31)))
+			break;
+		chunk = find_chunk(p);
+		mark = chunk->mark;
+		remove_chunk_node(chunk, p);
+		fsnotify_get_mark(mark);
+		spin_unlock(&hash_lock);
 
-		p = list_entry(victim->chunks.next, struct node, list);
+		untag_chunk(chunk, mark);
+		fsnotify_put_mark(mark);
 
-		untag_chunk(p);
+		spin_lock(&hash_lock);
 	}
 	spin_unlock(&hash_lock);
 	put_tree(victim);
 }
 
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+	prune_tree_chunks(victim, false);
+}
+
 /* trim the uncommitted chunks from tree */
 
 static void trim_marked(struct audit_tree *tree)
@@ -614,18 +624,11 @@ static void trim_marked(struct audit_tree *tree)
 			list_add(p, &tree->chunks);
 		}
 	}
+	spin_unlock(&hash_lock);
 
-	while (!list_empty(&tree->chunks)) {
-		struct node *node;
-
-		node = list_entry(tree->chunks.next, struct node, list);
-
-		/* have we run out of marked? */
-		if (!(node->index & (1U<<31)))
-			break;
+	prune_tree_chunks(tree, true);
 
-		untag_chunk(node);
-	}
+	spin_lock(&hash_lock);
 	if (!tree->root && !tree->goner) {
 		tree->goner = 1;
 		spin_unlock(&hash_lock);
-- 
cgit 


From 83d23bc8aedc51fc40078026e9fae6e349d83b2a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:55:16 -0500
Subject: audit: Replace chunk attached to mark instead of replacing mark

Audit tree code currently associates new fsnotify mark with each new
chunk. As chunk attached to an inode is replaced when new tag is added /
removed, we also need to remove old fsnotify mark and add a new one on
such occasion.  This is cumbersome and makes locking rules somewhat
difficult to follow.

Fix these problems by allocating fsnotify mark independently of chunk
and keeping it all the time while there is some chunk attached to an
inode. Also add documentation about the locking rules so that things are
easier to follow.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
[PM: minor merge fuzz due to updated patches previously in the series]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 160 +++++++++++++++++++++++++++-------------------------
 1 file changed, 83 insertions(+), 77 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 82b27da7031c..1d8dc20296fb 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -27,7 +27,6 @@ struct audit_chunk {
 	unsigned long key;
 	struct fsnotify_mark *mark;
 	struct list_head trees;		/* with root here */
-	int dead;
 	int count;
 	atomic_long_t refs;
 	struct rcu_head head;
@@ -48,8 +47,15 @@ static LIST_HEAD(prune_list);
 static struct task_struct *prune_thread;
 
 /*
- * One struct chunk is attached to each inode of interest.
- * We replace struct chunk on tagging/untagging.
+ * One struct chunk is attached to each inode of interest through
+ * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging /
+ * untagging, the mark is stable as long as there is chunk attached. The
+ * association between mark and chunk is protected by hash_lock and
+ * audit_tree_group->mark_mutex. Thus as long as we hold
+ * audit_tree_group->mark_mutex and check that the mark is alive by
+ * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to
+ * the current chunk.
+ *
  * Rules have pointer to struct audit_tree.
  * Rules have struct list_head rlist forming a list of rules over
  * the same tree.
@@ -68,8 +74,12 @@ static struct task_struct *prune_thread;
  * tree is refcounted; one reference for "some rules on rules_list refer to
  * it", one for each chunk with pointer to it.
  *
- * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
- * of watch contributes 1 to .refs).
+ * chunk is refcounted by embedded .refs. Mark associated with the chunk holds
+ * one chunk reference. This reference is dropped either when a mark is going
+ * to be freed (corresponding inode goes away) or when chunk attached to the
+ * mark gets replaced. This reference must be dropped using
+ * audit_mark_put_chunk() to make sure the reference is dropped only after RCU
+ * grace period as it protects RCU readers of the hash table.
  *
  * node.index allows to get from node.list to containing chunk.
  * MSB of that sucker is stolen to mark taggings that we might have to
@@ -160,8 +170,6 @@ static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
 
 static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
 {
-	struct audit_chunk *chunk = mark_chunk(entry);
-	audit_mark_put_chunk(chunk);
 	kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry));
 }
 
@@ -188,13 +196,6 @@ static struct audit_chunk *alloc_chunk(int count)
 	if (!chunk)
 		return NULL;
 
-	chunk->mark = alloc_mark();
-	if (!chunk->mark) {
-		kfree(chunk);
-		return NULL;
-	}
-	audit_mark(chunk->mark)->chunk = chunk;
-
 	INIT_LIST_HEAD(&chunk->hash);
 	INIT_LIST_HEAD(&chunk->trees);
 	chunk->count = count;
@@ -277,6 +278,20 @@ static struct audit_chunk *find_chunk(struct node *p)
 	return container_of(p, struct audit_chunk, owners[0]);
 }
 
+static void replace_mark_chunk(struct fsnotify_mark *entry,
+			       struct audit_chunk *chunk)
+{
+	struct audit_chunk *old;
+
+	assert_spin_locked(&hash_lock);
+	old = mark_chunk(entry);
+	audit_mark(entry)->chunk = chunk;
+	if (chunk)
+		chunk->mark = entry;
+	if (old)
+		old->mark = NULL;
+}
+
 static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
 {
 	struct audit_tree *owner;
@@ -299,6 +314,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
 		get_tree(owner);
 		list_replace_init(&old->owners[j].list, &new->owners[i].list);
 	}
+	replace_mark_chunk(old->mark, new);
 	/*
 	 * Make sure chunk is fully initialized before making it visible in the
 	 * hash. Pairs with a data dependency barrier in READ_ONCE() in
@@ -339,21 +355,23 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 
 	mutex_lock(&audit_tree_group->mark_mutex);
 	/*
-	 * mark_mutex protects mark from getting detached and thus also from
-	 * mark->connector->obj getting NULL.
+	 * mark_mutex stabilizes chunk attached to the mark so we can check
+	 * whether it didn't change while we've dropped hash_lock.
 	 */
-	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
+	if (!(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
+	    mark_chunk(entry) != chunk)
 		goto out_mutex;
 
 	size = chunk_count_trees(chunk);
 	if (!size) {
-		chunk->dead = 1;
 		spin_lock(&hash_lock);
 		list_del_init(&chunk->trees);
 		list_del_rcu(&chunk->hash);
+		replace_mark_chunk(entry, NULL);
 		spin_unlock(&hash_lock);
 		fsnotify_detach_mark(entry);
 		mutex_unlock(&audit_tree_group->mark_mutex);
+		audit_mark_put_chunk(chunk);
 		fsnotify_free_mark(entry);
 		return;
 	}
@@ -362,13 +380,6 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 	if (!new)
 		goto out_mutex;
 
-	if (fsnotify_add_mark_locked(new->mark, entry->connector->obj,
-				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
-		fsnotify_put_mark(new->mark);
-		goto out_mutex;
-	}
-
-	chunk->dead = 1;
 	spin_lock(&hash_lock);
 	/*
 	 * This has to go last when updating chunk as once replace_chunk() is
@@ -376,10 +387,8 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 	 */
 	replace_chunk(new, chunk);
 	spin_unlock(&hash_lock);
-	fsnotify_detach_mark(entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-	fsnotify_free_mark(entry);
-	fsnotify_put_mark(new->mark);	/* drop initial reference */
+	audit_mark_put_chunk(chunk);
 	return;
 
 out_mutex:
@@ -397,23 +406,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 	}
 
-	entry = chunk->mark;
+	entry = alloc_mark();
+	if (!entry) {
+		mutex_unlock(&audit_tree_group->mark_mutex);
+		kfree(chunk);
+		return -ENOMEM;
+	}
+
 	if (fsnotify_add_inode_mark_locked(entry, inode, 0)) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_put_mark(entry);
+		kfree(chunk);
 		return -ENOSPC;
 	}
 
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
-		chunk->dead = 1;
 		fsnotify_detach_mark(entry);
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		fsnotify_free_mark(entry);
 		fsnotify_put_mark(entry);
+		kfree(chunk);
 		return 0;
 	}
+	replace_mark_chunk(entry, chunk);
 	chunk->owners[0].index = (1U << 31);
 	chunk->owners[0].owner = tree;
 	get_tree(tree);
@@ -430,33 +447,41 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-	fsnotify_put_mark(entry);	/* drop initial reference */
+	/*
+	 * Drop our initial reference. When mark we point to is getting freed,
+	 * we get notification through ->freeing_mark callback and cleanup
+	 * chunk pointing to this mark.
+	 */
+	fsnotify_put_mark(entry);
 	return 0;
 }
 
 /* the first tagged inode becomes root of tree */
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct fsnotify_mark *old_entry, *chunk_entry;
+	struct fsnotify_mark *entry;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
 	int n;
 
 	mutex_lock(&audit_tree_group->mark_mutex);
-	old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
-				       audit_tree_group);
-	if (!old_entry)
+	entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+	if (!entry)
 		return create_chunk(inode, tree);
 
-	old = mark_chunk(old_entry);
-
+	/*
+	 * Found mark is guaranteed to be attached and mark_mutex protects mark
+	 * from getting detached and thus it makes sure there is chunk attached
+	 * to the mark.
+	 */
 	/* are we already there? */
 	spin_lock(&hash_lock);
+	old = mark_chunk(entry);
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
 			mutex_unlock(&audit_tree_group->mark_mutex);
-			fsnotify_put_mark(old_entry);
+			fsnotify_put_mark(entry);
 			return 0;
 		}
 	}
@@ -465,41 +490,16 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(old_entry);
+		fsnotify_put_mark(entry);
 		return -ENOMEM;
 	}
 
-	chunk_entry = chunk->mark;
-
-	/*
-	 * mark_mutex protects mark from getting detached and thus also from
-	 * mark->connector->obj getting NULL.
-	 */
-	if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
-		/* old_entry is being shot, lets just lie */
-		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(old_entry);
-		fsnotify_put_mark(chunk->mark);
-		return -ENOENT;
-	}
-
-	if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
-				     FSNOTIFY_OBJ_TYPE_INODE, 1)) {
-		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(chunk_entry);
-		fsnotify_put_mark(old_entry);
-		return -ENOSPC;
-	}
-
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
-		chunk->dead = 1;
-		fsnotify_detach_mark(chunk_entry);
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_free_mark(chunk_entry);
-		fsnotify_put_mark(chunk_entry);
-		fsnotify_put_mark(old_entry);
+		fsnotify_put_mark(entry);
+		kfree(chunk);
 		return 0;
 	}
 	p = &chunk->owners[chunk->count - 1];
@@ -507,7 +507,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	p->owner = tree;
 	get_tree(tree);
 	list_add(&p->list, &tree->chunks);
-	old->dead = 1;
 	if (!tree->root) {
 		tree->root = chunk;
 		list_add(&tree->same_root, &chunk->trees);
@@ -518,11 +517,10 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	 */
 	replace_chunk(chunk, old);
 	spin_unlock(&hash_lock);
-	fsnotify_detach_mark(old_entry);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-	fsnotify_free_mark(old_entry);
-	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
-	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+	fsnotify_put_mark(entry); /* pair to fsnotify_find mark_entry */
+	audit_mark_put_chunk(old);
+
 	return 0;
 }
 
@@ -585,6 +583,9 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
 		chunk = find_chunk(p);
 		mark = chunk->mark;
 		remove_chunk_node(chunk, p);
+		/* Racing with audit_tree_freeing_mark()? */
+		if (!mark)
+			continue;
 		fsnotify_get_mark(mark);
 		spin_unlock(&hash_lock);
 
@@ -1007,10 +1008,6 @@ static void evict_chunk(struct audit_chunk *chunk)
 	int need_prune = 0;
 	int n;
 
-	if (chunk->dead)
-		return;
-
-	chunk->dead = 1;
 	mutex_lock(&audit_filter_mutex);
 	spin_lock(&hash_lock);
 	while (!list_empty(&chunk->trees)) {
@@ -1049,9 +1046,18 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 
 static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
 {
-	struct audit_chunk *chunk = mark_chunk(entry);
+	struct audit_chunk *chunk;
 
-	evict_chunk(chunk);
+	mutex_lock(&entry->group->mark_mutex);
+	spin_lock(&hash_lock);
+	chunk = mark_chunk(entry);
+	replace_mark_chunk(entry, NULL);
+	spin_unlock(&hash_lock);
+	mutex_unlock(&entry->group->mark_mutex);
+	if (chunk) {
+		evict_chunk(chunk);
+		audit_mark_put_chunk(chunk);
+	}
 
 	/*
 	 * We are guaranteed to have at least one reference to the mark from
-- 
cgit 


From f905c2fc3980a41aeccb8673ab10ed5e616391fd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Nov 2018 09:55:16 -0500
Subject: audit: Use 'mark' name for fsnotify_mark variables

Variables pointing to fsnotify_mark are sometimes called 'entry' and
sometimes 'mark'. Use 'mark' in all places.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
[PM: minor merge fuzz due to updated patches previously in the series]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 79 +++++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1d8dc20296fb..58e84eb5d826 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -158,9 +158,9 @@ static void audit_mark_put_chunk(struct audit_chunk *chunk)
 	call_rcu(&chunk->head, __put_chunk);
 }
 
-static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *entry)
+static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark)
 {
-	return container_of(entry, struct audit_tree_mark, mark);
+	return container_of(mark, struct audit_tree_mark, mark);
 }
 
 static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
@@ -168,9 +168,9 @@ static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
 	return audit_mark(mark)->chunk;
 }
 
-static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+static void audit_tree_destroy_watch(struct fsnotify_mark *mark)
 {
-	kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry));
+	kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark));
 }
 
 static struct fsnotify_mark *alloc_mark(void)
@@ -224,7 +224,7 @@ static inline struct list_head *chunk_hash(unsigned long key)
 	return chunk_hash_heads + n % HASH_SIZE;
 }
 
-/* hash_lock & entry->group->mark_mutex is held by caller */
+/* hash_lock & mark->group->mark_mutex is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
 	struct list_head *list;
@@ -278,16 +278,16 @@ static struct audit_chunk *find_chunk(struct node *p)
 	return container_of(p, struct audit_chunk, owners[0]);
 }
 
-static void replace_mark_chunk(struct fsnotify_mark *entry,
+static void replace_mark_chunk(struct fsnotify_mark *mark,
 			       struct audit_chunk *chunk)
 {
 	struct audit_chunk *old;
 
 	assert_spin_locked(&hash_lock);
-	old = mark_chunk(entry);
-	audit_mark(entry)->chunk = chunk;
+	old = mark_chunk(mark);
+	audit_mark(mark)->chunk = chunk;
 	if (chunk)
-		chunk->mark = entry;
+		chunk->mark = mark;
 	if (old)
 		old->mark = NULL;
 }
@@ -348,7 +348,7 @@ static int chunk_count_trees(struct audit_chunk *chunk)
 	return ret;
 }
 
-static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
+static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
 {
 	struct audit_chunk *new;
 	int size;
@@ -358,8 +358,8 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 	 * mark_mutex stabilizes chunk attached to the mark so we can check
 	 * whether it didn't change while we've dropped hash_lock.
 	 */
-	if (!(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
-	    mark_chunk(entry) != chunk)
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
+	    mark_chunk(mark) != chunk)
 		goto out_mutex;
 
 	size = chunk_count_trees(chunk);
@@ -367,12 +367,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry)
 		spin_lock(&hash_lock);
 		list_del_init(&chunk->trees);
 		list_del_rcu(&chunk->hash);
-		replace_mark_chunk(entry, NULL);
+		replace_mark_chunk(mark, NULL);
 		spin_unlock(&hash_lock);
-		fsnotify_detach_mark(entry);
+		fsnotify_detach_mark(mark);
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		audit_mark_put_chunk(chunk);
-		fsnotify_free_mark(entry);
+		fsnotify_free_mark(mark);
 		return;
 	}
 
@@ -398,7 +398,7 @@ out_mutex:
 /* Call with group->mark_mutex held, releases it */
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct fsnotify_mark *entry;
+	struct fsnotify_mark *mark;
 	struct audit_chunk *chunk = alloc_chunk(1);
 
 	if (!chunk) {
@@ -406,16 +406,16 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 	}
 
-	entry = alloc_mark();
-	if (!entry) {
+	mark = alloc_mark();
+	if (!mark) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
 		kfree(chunk);
 		return -ENOMEM;
 	}
 
-	if (fsnotify_add_inode_mark_locked(entry, inode, 0)) {
+	if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(entry);
+		fsnotify_put_mark(mark);
 		kfree(chunk);
 		return -ENOSPC;
 	}
@@ -423,14 +423,14 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
-		fsnotify_detach_mark(entry);
+		fsnotify_detach_mark(mark);
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_free_mark(entry);
-		fsnotify_put_mark(entry);
+		fsnotify_free_mark(mark);
+		fsnotify_put_mark(mark);
 		kfree(chunk);
 		return 0;
 	}
-	replace_mark_chunk(entry, chunk);
+	replace_mark_chunk(mark, chunk);
 	chunk->owners[0].index = (1U << 31);
 	chunk->owners[0].owner = tree;
 	get_tree(tree);
@@ -452,21 +452,21 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	 * we get notification through ->freeing_mark callback and cleanup
 	 * chunk pointing to this mark.
 	 */
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(mark);
 	return 0;
 }
 
 /* the first tagged inode becomes root of tree */
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct fsnotify_mark *entry;
+	struct fsnotify_mark *mark;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
 	int n;
 
 	mutex_lock(&audit_tree_group->mark_mutex);
-	entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
-	if (!entry)
+	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+	if (!mark)
 		return create_chunk(inode, tree);
 
 	/*
@@ -476,12 +476,12 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	 */
 	/* are we already there? */
 	spin_lock(&hash_lock);
-	old = mark_chunk(entry);
+	old = mark_chunk(mark);
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
 			mutex_unlock(&audit_tree_group->mark_mutex);
-			fsnotify_put_mark(entry);
+			fsnotify_put_mark(mark);
 			return 0;
 		}
 	}
@@ -490,7 +490,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(entry);
+		fsnotify_put_mark(mark);
 		return -ENOMEM;
 	}
 
@@ -498,7 +498,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		mutex_unlock(&audit_tree_group->mark_mutex);
-		fsnotify_put_mark(entry);
+		fsnotify_put_mark(mark);
 		kfree(chunk);
 		return 0;
 	}
@@ -518,7 +518,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	replace_chunk(chunk, old);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&audit_tree_group->mark_mutex);
-	fsnotify_put_mark(entry); /* pair to fsnotify_find mark_entry */
+	fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
 	audit_mark_put_chunk(old);
 
 	return 0;
@@ -1044,16 +1044,17 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 	return 0;
 }
 
-static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
+				    struct fsnotify_group *group)
 {
 	struct audit_chunk *chunk;
 
-	mutex_lock(&entry->group->mark_mutex);
+	mutex_lock(&mark->group->mark_mutex);
 	spin_lock(&hash_lock);
-	chunk = mark_chunk(entry);
-	replace_mark_chunk(entry, NULL);
+	chunk = mark_chunk(mark);
+	replace_mark_chunk(mark, NULL);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&entry->group->mark_mutex);
+	mutex_unlock(&mark->group->mark_mutex);
 	if (chunk) {
 		evict_chunk(chunk);
 		audit_mark_put_chunk(chunk);
@@ -1063,7 +1064,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	 * We are guaranteed to have at least one reference to the mark from
 	 * either the inode or the caller of fsnotify_destroy_mark().
 	 */
-	BUG_ON(refcount_read(&entry->refcnt) < 1);
+	BUG_ON(refcount_read(&mark->refcnt) < 1);
 }
 
 static const struct fsnotify_ops audit_tree_ops = {
-- 
cgit 


From 9213784b48f8ba666b4695ca3f5d34f583daab83 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 22 Oct 2018 08:26:00 -0700
Subject: rcu: Eliminate BUG_ON() for kernel/rcu/tree_plugin.h

The tree_plugin.h file has a number of calls to BUG_ON(), which panics
the kernel, which is not a good strategy for devices (like embedded)
that don't have a way to capture console output.  This commit therefore
converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE().

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
[ paulmck: Fix typo: s/rcuo/rcub/. ]
---
 kernel/rcu/tree_plugin.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 05915e536336..adda608874a8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1464,7 +1464,8 @@ static void __init rcu_spawn_boost_kthreads(void)
 
 	for_each_possible_cpu(cpu)
 		per_cpu(rcu_cpu_has_work, cpu) = 0;
-	BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+	if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
+		return;
 	rcu_for_each_leaf_node(rnp)
 		(void)rcu_spawn_one_boost_kthread(rnp);
 }
@@ -2322,7 +2323,8 @@ static int rcu_nocb_kthread(void *arg)
 		tail = rdp->nocb_follower_tail;
 		rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-		BUG_ON(!list);
+		if (WARN_ON_ONCE(!list))
+			continue;
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
 
 		/* Each pass through the following loop invokes a callback. */
@@ -2495,7 +2497,8 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
 	/* Spawn the kthread for this CPU. */
 	t = kthread_run(rcu_nocb_kthread, rdp_spawn,
 			"rcuo%c/%d", rcu_state.abbr, cpu);
-	BUG_ON(IS_ERR(t));
+	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__))
+		return;
 	WRITE_ONCE(rdp_spawn->nocb_kthread, t);
 }
 
-- 
cgit 


From f0ad56e876cdd67730065274625edbcfe0cca278 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 22 Oct 2018 08:33:06 -0700
Subject: rcu: Eliminate BUG_ON() for kernel/rcu/update.c

The update.c file has a number of calls to BUG_ON(), which panics the
kernel, which is not a good strategy for devices (like embedded) that
don't have a way to capture console output.  This commit therefore
converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE().

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/update.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f203b94f6b5b..cb26de25658a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -822,7 +822,8 @@ static int __init rcu_spawn_tasks_kthread(void)
 	struct task_struct *t;
 
 	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
-	BUG_ON(IS_ERR(t));
+	if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__))
+		return 0;
 	smp_mb(); /* Ensure others see full kthread. */
 	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
 	return 0;
-- 
cgit 


From b3c1d9ec7c59feadd22693f43145d8285bd35b04 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 1 Oct 2018 13:25:32 -0700
Subject: rcu: Avoid double multiply by HZ

The rcu_check_gp_start_stall() function multiplies the return value
from rcu_jiffies_till_stall_check() by HZ, but the units are already
in jiffies.  This commit therefore avoids the need for introduction of
a jiffies-squared unit by removing the extraneous multiplication.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 121f833acd04..4933f5d9d992 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2603,7 +2603,7 @@ static void force_quiescent_state(void)
 static void
 rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ;
+	const unsigned long gpssdelay = rcu_jiffies_till_stall_check();
 	unsigned long flags;
 	unsigned long j;
 	struct rcu_node *rnp_root = rcu_get_root();
-- 
cgit 


From 791416c47153b45f640d52baaf30995d9d396a08 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 1 Oct 2018 15:42:44 -0700
Subject: rcu: Parameterize rcu_check_gp_start_stall()

In order to debug forward-progress stalls, it is necessary to check
for excessively delayed grace-period starts.  This is currently done
for RCU CPU stall warnings by rcu_check_gp_start_stall(), which checks
to see if the start of a requested grace period has been delayed by an
RCU CPU stall warning period.  Because rcutorture will need to check
for the time consumed by an RCU forward-progress delay, this commit
promotes gpssdelay from a local variable to a formal parameter.  It is
not necessary to export rcu_check_gp_start_stall() because rcutorture
will access it via a wrapper function.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4933f5d9d992..36e30150e1e9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2600,10 +2600,10 @@ static void force_quiescent_state(void)
  * This function checks for grace-period requests that fail to motivate
  * RCU to come out of its idle mode.
  */
-static void
-rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp)
+void
+rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+			 const unsigned long gpssdelay)
 {
-	const unsigned long gpssdelay = rcu_jiffies_till_stall_check();
 	unsigned long flags;
 	unsigned long j;
 	struct rcu_node *rnp_root = rcu_get_root();
@@ -2690,7 +2690,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
 		local_irq_restore(flags);
 	}
 
-	rcu_check_gp_start_stall(rnp, rdp);
+	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
 
 	/* If there are callbacks ready, invoke them. */
 	if (rcu_segcblist_ready_cbs(&rdp->cblist))
-- 
cgit 


From 691960197e8daa39bf89886ba2e39de1e33f1ce4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 2 Oct 2018 11:24:08 -0700
Subject: rcu: Add state name to show_rcu_gp_kthreads() output

This commit adds the name of the RCU grace-period state to
the show_rcu_gp_kthreads() output in order to ease debugging.
This commit also moves gp_state_getname() up in the code so that
show_rcu_gp_kthreads() can use it.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 36e30150e1e9..ea78532183ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -499,6 +499,16 @@ void rcu_force_quiescent_state(void)
 }
 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+		return "???";
+	return gp_state_names[gs];
+}
+
 /*
  * Show the state of the grace-period kthreads.
  */
@@ -508,8 +518,9 @@ void show_rcu_gp_kthreads(void)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
-	pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name,
-		rcu_state.gp_state, rcu_state.gp_kthread->state);
+	pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name,
+		gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
+		rcu_state.gp_kthread->state);
 	rcu_for_each_node_breadth_first(rnp) {
 		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
 			continue;
@@ -1142,16 +1153,6 @@ static void record_gp_stall_check_time(void)
 	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
 }
 
-/*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
-	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
-		return "???";
-	return gp_state_names[gs];
-}
-
 /*
  * Complain about starvation of grace-period kthread.
  */
-- 
cgit 


From c669c014d1dae9c7cdbfff049c798722f8650829 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 2 Oct 2018 12:42:21 -0700
Subject: rcu: Add jiffies-since-GP-activity to show_rcu_gp_kthreads()

This commit adds a printout of the number of jiffies since the last time
that the RCU grace-period kthread did any processing.  This can be useful
when tracking down forward-progress issues.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ea78532183ac..e7c9848d1e1b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -515,12 +515,14 @@ static const char *gp_state_getname(short gs)
 void show_rcu_gp_kthreads(void)
 {
 	int cpu;
+	unsigned long j;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
-	pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name,
-		gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
-		rcu_state.gp_kthread->state);
+	j = jiffies - READ_ONCE(rcu_state.gp_activity);
+	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n",
+		rcu_state.name, gp_state_getname(rcu_state.gp_state),
+		rcu_state.gp_state, rcu_state.gp_kthread->state, j);
 	rcu_for_each_node_breadth_first(rnp) {
 		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
 			continue;
-- 
cgit 


From 2320bda26df75c02f86c9fa42c3355ebcd16d8ed Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <zhouzhouyi@gmail.com>
Date: Mon, 8 Oct 2018 06:50:41 +0000
Subject: rcu: Adjust the comment of function rcu_is_watching

Because RCU avoids interrupting idle CPUs, rcu_is_watching() is used to
test whether or not it is currently legal to run RCU read-side critical
sections on this CPU.  However, the first sentence and last sentences
of current comment for rcu_is_watching have opposite meaning of what
is expected.  This commit therefore fixes this header comment.

Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e7c9848d1e1b..429a46fb087a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -904,12 +904,12 @@ void rcu_irq_enter_irqson(void)
 }
 
 /**
- * rcu_is_watching - see if RCU thinks that the current CPU is idle
+ * rcu_is_watching - see if RCU thinks that the current CPU is not idle
  *
  * Return true if RCU is watching the running CPU, which means that this
  * CPU can safely enter RCU read-side critical sections.  In other words,
- * if the current CPU is in its idle loop and is neither in an interrupt
- * or NMI handler, return true.
+ * if the current CPU is not in its idle loop or is in an interrupt or
+ * NMI handler, return true.
  */
 bool notrace rcu_is_watching(void)
 {
-- 
cgit 


From 0a89e5a402e957a97129423599a6ef6342da2764 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 15 Oct 2018 10:00:58 -0700
Subject: rcu: Trace end of grace period before end of grace period

Currently, rcu_gp_cleanup() traces the end of the old grace period after
the old grace period has officially ended.  This might make intuitive
sense, but it also makes for confusing event-trace output because the
"end" trace displays not the old but instead the new grace-period number.
This commit therefore traces the end of an old grace period just before
that grace period officially ends.

Reported-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 429a46fb087a..61bae41b1afe 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2035,9 +2035,9 @@ static void rcu_gp_cleanup(void)
 	rnp = rcu_get_root();
 	raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
 
-	/* Declare grace period done. */
-	rcu_seq_end(&rcu_state.gp_seq);
+	/* Declare grace period done, trace first to use old GP number. */
 	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
+	rcu_seq_end(&rcu_state.gp_seq);
 	rcu_state.gp_state = RCU_GP_IDLE;
 	/* Check for GP requests since above loop. */
 	rdp = this_cpu_ptr(&rcu_data);
-- 
cgit 


From 05f415715ce45da07a0b1a5eac842765b733157f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 16 Oct 2018 04:12:58 -0700
Subject: rcu: Speed up expedited GPs when interrupting RCU reader

In PREEMPT kernels, an expedited grace period might send an IPI to a
CPU that is executing an RCU read-side critical section.  In that case,
it would be nice if the rcu_read_unlock() directly interacted with the
RCU core code to immediately report the quiescent state.  And this does
happen in the case where the reader has been preempted.  But it would
also be a nice performance optimization if immediate reporting also
happened in the preemption-free case.

This commit therefore adds an ->exp_hint field to the task_struct structure's
->rcu_read_unlock_special field.  The IPI handler sets this hint when
it has interrupted an RCU read-side critical section, and this causes
the outermost rcu_read_unlock() call to invoke rcu_read_unlock_special(),
which, if preemption is enabled, reports the quiescent state immediately.
If preemption is disabled, then the report is required to be deferred
until preemption (or bottom halves or interrupts or whatever) is re-enabled.

Because this is a hint, it does nothing for more complicated cases.  For
example, if the IPI interrupts an RCU reader, but interrupts are disabled
across the rcu_read_unlock(), but another rcu_read_lock() is executed
before interrupts are re-enabled, the hint will already have been cleared.
If you do crazy things like this, reporting will be deferred until some
later RCU_SOFTIRQ handler, context switch, cond_resched(), or similar.

Reported-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/sched.h    |  4 +++-
 kernel/rcu/tree_exp.h    |  4 +++-
 kernel/rcu/tree_plugin.h | 14 +++++++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a51c13c2b1a0..e4c7b6241088 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -572,8 +572,10 @@ union rcu_special {
 	struct {
 		u8			blocked;
 		u8			need_qs;
+		u8			exp_hint; /* Hint for performance. */
+		u8			pad; /* No garbage from compiler! */
 	} b; /* Bits. */
-	u16 s; /* Set of bits. */
+	u32 s; /* Set of bits. */
 };
 
 enum perf_event_task_context {
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e669ccf3751b..928fe5893a57 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -692,8 +692,10 @@ static void sync_rcu_exp_handler(void *unused)
 	 */
 	if (t->rcu_read_lock_nesting > 0) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-		if (rnp->expmask & rdp->grpmask)
+		if (rnp->expmask & rdp->grpmask) {
 			rdp->deferred_qs = true;
+			WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
+		}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 05915e536336..618956cc7a55 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -642,13 +642,21 @@ static void rcu_read_unlock_special(struct task_struct *t)
 
 	local_irq_save(flags);
 	irqs_were_disabled = irqs_disabled_flags(flags);
-	if ((preempt_bh_were_disabled || irqs_were_disabled) &&
-	    t->rcu_read_unlock_special.b.blocked) {
+	if (preempt_bh_were_disabled || irqs_were_disabled) {
+		WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
 		/* Need to defer quiescent state until everything is enabled. */
-		raise_softirq_irqoff(RCU_SOFTIRQ);
+		if (irqs_were_disabled) {
+			/* Enabling irqs does not reschedule, so... */
+			raise_softirq_irqoff(RCU_SOFTIRQ);
+		} else {
+			/* Enabling BH or preempt does reschedule, so... */
+			set_tsk_need_resched(current);
+			set_preempt_need_resched();
+		}
 		local_irq_restore(flags);
 		return;
 	}
+	WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
 	rcu_preempt_deferred_qs_irqrestore(t, flags);
 }
 
-- 
cgit 


From 117f683c6e0104e1d6dfe8f143ea9c24ab069044 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 5 Nov 2018 14:20:57 -0800
Subject: rcu: Replace this_cpu_ptr() with __this_cpu_read()

Because __this_cpu_read() can be lighter weight than equivalent uses of
this_cpu_ptr(), this commit replaces the latter with the former.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 618956cc7a55..0bb1c1593ca4 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -597,7 +597,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
  */
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
-	return (this_cpu_ptr(&rcu_data)->deferred_qs ||
+	return (__this_cpu_read(rcu_data.deferred_qs) ||
 		READ_ONCE(t->rcu_read_unlock_special.s)) &&
 	       t->rcu_read_lock_nesting <= 0;
 }
-- 
cgit 


From 5f1a6ef3746f536157922197d98676fa21154549 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 29 Oct 2018 07:36:50 -0700
Subject: rcu: Avoid signed integer overflow in rcu_preempt_deferred_qs()

Subtracting INT_MIN can be interpreted as unconditional signed integer
overflow, which according to the C standard is undefined behavior.
Therefore, kernel build arguments notwithstanding, it would be good to
future-proof the code.  This commit therefore substitutes INT_MAX for
INT_MIN in order to avoid undefined behavior.

While in the neighborhood, this commit also creates some meaningful names
for INT_MAX and friends in order to improve readability, as suggested
by Joel Fernandes.

Reported-by: Ran Rozenstein <ranro@mellanox.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree_plugin.h | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0bb1c1593ca4..3ed43f8cb029 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -397,6 +397,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 	return rnp->gp_tasks != NULL;
 }
 
+/* Bias and limit values for ->rcu_read_lock_nesting. */
+#define RCU_NEST_BIAS INT_MAX
+#define RCU_NEST_NMAX (-INT_MAX / 2)
+#define RCU_NEST_PMAX (INT_MAX / 2)
+
 /*
  * Preemptible RCU implementation for rcu_read_lock().
  * Just increment ->rcu_read_lock_nesting, shared state will be updated
@@ -405,6 +410,8 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 void __rcu_read_lock(void)
 {
 	current->rcu_read_lock_nesting++;
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+		WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
 	barrier();  /* critical section after entry code. */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -424,20 +431,18 @@ void __rcu_read_unlock(void)
 		--t->rcu_read_lock_nesting;
 	} else {
 		barrier();  /* critical section before exit code. */
-		t->rcu_read_lock_nesting = INT_MIN;
+		t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
 		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 			rcu_read_unlock_special(t);
 		barrier();  /* ->rcu_read_unlock_special load before assign */
 		t->rcu_read_lock_nesting = 0;
 	}
-#ifdef CONFIG_PROVE_LOCKING
-	{
-		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+		int rrln = t->rcu_read_lock_nesting;
 
-		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+		WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
 	}
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 
@@ -617,11 +622,11 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
 	if (!rcu_preempt_need_deferred_qs(t))
 		return;
 	if (couldrecurse)
-		t->rcu_read_lock_nesting -= INT_MIN;
+		t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
 	local_irq_save(flags);
 	rcu_preempt_deferred_qs_irqrestore(t, flags);
 	if (couldrecurse)
-		t->rcu_read_lock_nesting += INT_MIN;
+		t->rcu_read_lock_nesting += RCU_NEST_BIAS;
 }
 
 /*
-- 
cgit 


From 04547728b7b775333a4f6fbb3c55102a79dc4596 Mon Sep 17 00:00:00 2001
From: Lance Roy <ldr709@gmail.com>
Date: Thu, 4 Oct 2018 23:45:46 -0700
Subject: locking/mutex: Replace spin_is_locked() with lockdep

lockdep_assert_held() is better suited to checking locking requirements,
since it only checks if the current thread holds the lock regardless of
whether someone else does. This is also a step towards possibly removing
spin_is_locked().

Signed-off-by: Lance Roy <ldr709@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/locking/mutex-debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 9aa713629387..771d4ca96dda 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -36,7 +36,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
 
 void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
 {
-	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+	lockdep_assert_held(&lock->wait_lock);
 	DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
 	DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -51,7 +51,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 			    struct task_struct *task)
 {
-	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+	lockdep_assert_held(&lock->wait_lock);
 
 	/* Mark the current thread as blocked on the lock: */
 	task->blocked_on = waiter;
-- 
cgit 


From b1e3aeb11c5e86ee0988a038c4e7682d6beaa977 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Nov 2018 12:03:33 -0800
Subject: cpuset: Minor cgroup2 interface updates

* Rename the partition file from "cpuset.sched.partition" to
  "cpuset.cpus.partition".

* When writing to the partition file, drop "0" and "1" and only accept
  "member" and "root".

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 6 +++---
 kernel/cgroup/cpuset.c                  | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index f83a5231bbe3..07e06136a550 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1708,15 +1708,15 @@ Cpuset Interface Files
 
 	Its value will be affected by memory nodes hotplug events.
 
-  cpuset.sched.partition
+  cpuset.cpus.partition
 	A read-write single value file which exists on non-root
 	cpuset-enabled cgroups.  This flag is owned by the parent cgroup
 	and is not delegatable.
 
         It accepts only the following input values when written to.
 
-        "root" or "1"   - a paritition root
-        "member" or "0" - a non-root member of a partition
+        "root"   - a paritition root
+        "member" - a non-root member of a partition
 
 	When set to be a partition root, the current cgroup is the
 	root of a new partition or scheduling domain that comprises
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b897314bab53..1151e93d71b6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2468,11 +2468,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 	buf = strstrip(buf);
 
 	/*
-	 * Convert "root"/"1" to 1, and convert "member"/"0" to 0.
+	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
 	 */
-	if (!strcmp(buf, "root") || !strcmp(buf, "1"))
+	if (!strcmp(buf, "root"))
 		val = PRS_ENABLED;
-	else if (!strcmp(buf, "member") || !strcmp(buf, "0"))
+	else if (!strcmp(buf, "member"))
 		val = PRS_DISABLED;
 	else
 		return -EINVAL;
@@ -2631,7 +2631,7 @@ static struct cftype dfl_files[] = {
 	},
 
 	{
-		.name = "sched.partition",
+		.name = "cpus.partition",
 		.seq_show = sched_partition_show,
 		.write = sched_partition_write,
 		.private = FILE_PARTITION_ROOT,
-- 
cgit 


From c1bbd933e5fae83f96acd3f748bb01a0300b315d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Nov 2018 12:06:41 -0800
Subject: cgroup: Add .__DEBUG__. prefix to debug file names

Clearly mark the debug files and hide them by default by prefixing
".__DEBUG__.".

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/cgroup.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ed7f0bfe6429..e06994fd4e34 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1400,12 +1400,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 	struct cgroup_subsys *ss = cft->ss;
 
 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
-	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
-		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+		const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+			 dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
-	else
+	} else {
 		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+	}
 	return buf;
 }
 
-- 
cgit 


From 8ddab428730d66e86ebe90aef15d5f7c5c45fe0d Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 9 Nov 2018 13:16:39 +0000
Subject: padata: clean an indentation issue, remove extraneous space

Trivial fix to clean up an indentation issue

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index d568cc56405f..3e2633ae3bca 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst)
 	if (pinst->flags & PADATA_INVALID)
 		err = -EINVAL;
 
-	 __padata_start(pinst);
+	__padata_start(pinst);
 
 	mutex_unlock(&pinst->lock);
 
-- 
cgit 


From 592ee43faf860c1f2c0a4c11838db6fdb974bb78 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 13 Nov 2018 09:29:26 +0000
Subject: bpf: fix null pointer dereference on pointer offload

Pointer offload is being null checked however the following statement
dereferences the potentially null pointer offload when assigning
offload->dev_state.  Fix this by only assigning it if offload is not
null.

Detected by CoverityScan, CID#1475437 ("Dereference after null check")

Fixes: 00db12c3d141 ("bpf: call verifier_prep from its callback in struct bpf_offload_dev")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/offload.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 52c5617e3716..54cf2b9c44a4 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -130,9 +130,10 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
 
 	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
-	if (offload)
+	if (offload) {
 		ret = offload->offdev->ops->prepare(prog);
-	offload->dev_state = !ret;
+		offload->dev_state = !ret;
+	}
 	up_read(&bpf_devs_lock);
 
 	return ret;
-- 
cgit 


From 0fe3c7fceb500de2d0adfb9dcf292580cd43ea38 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 16 Nov 2018 12:16:35 -0500
Subject: audit: localize audit_log_session_info prototype

The audit_log_session_info() function is only used in kernel/audit*, so
move its prototype to kernel/audit.h

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 2 --
 kernel/audit.h        | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 9334fbef7bae..58cf665f597e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -115,8 +115,6 @@ extern int audit_classify_compat_syscall(int abi, unsigned syscall);
 
 struct filename;
 
-extern void audit_log_session_info(struct audit_buffer *ab);
-
 #define AUDIT_OFF	0
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
diff --git a/kernel/audit.h b/kernel/audit.h
index 214e14948370..9a3828bd387b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -210,6 +210,8 @@ struct audit_context {
 
 extern bool audit_ever_enabled;
 
+extern void audit_log_session_info(struct audit_buffer *ab);
+
 extern void audit_copy_inode(struct audit_names *name,
 			     const struct dentry *dentry,
 			     struct inode *inode);
-- 
cgit 


From a2c97da11cdb973b752dd434aee9636ce10ee737 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 16 Nov 2018 16:30:10 -0500
Subject: audit: use session_info helper

There are still a couple of places (mark and watch config changes) that
open code auid and ses fields in sequence in records instead of using
the audit_log_session_info() helper.  Use the helper.  Adjust the helper
to accommodate being the first fields.  Passes audit-testsuite.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: fixed misspellings in the description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c          | 6 +++---
 kernel/audit_fsnotify.c | 5 ++---
 kernel/audit_watch.c    | 5 ++---
 3 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 2a8058764aa6..6c53e373b828 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (unlikely(!ab))
 		return rc;
-	audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+	audit_log_format(ab, "%s=%u old=%u ", function_name, new, old);
 	audit_log_session_info(ab);
 	rc = audit_log_task_context(ab);
 	if (rc)
@@ -1067,7 +1067,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
 	if (unlikely(!*ab))
 		return;
-	audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+	audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
 	audit_log_session_info(*ab);
 	audit_log_task_context(*ab);
 }
@@ -2042,7 +2042,7 @@ void audit_log_session_info(struct audit_buffer *ab)
 	unsigned int sessionid = audit_get_sessionid(current);
 	uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
 
-	audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+	audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
 }
 
 void audit_log_key(struct audit_buffer *ab, char *key)
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index fba78047fb37..f90ffa699e5b 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -130,9 +130,8 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
 	ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
 	if (unlikely(!ab))
 		return;
-	audit_log_format(ab, "auid=%u ses=%u op=%s",
-			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
-			 audit_get_sessionid(current), op);
+	audit_log_session_info(ab);
+	audit_log_format(ab, " op=%s", op);
 	audit_log_format(ab, " path=");
 	audit_log_untrustedstring(ab, audit_mark->path);
 	audit_log_key(ab, rule->filterkey);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 787c7afdf829..568e48d1d0ab 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -245,9 +245,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 	ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u ses=%u op=%s",
-			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
-			 audit_get_sessionid(current), op);
+	audit_log_session_info(ab);
+	audit_log_format(ab, "op=%s", op);
 	audit_log_format(ab, " path=");
 	audit_log_untrustedstring(ab, w->path);
 	audit_log_key(ab, r->filterkey);
-- 
cgit 


From c8fc5d49c341805fee7fc295f2ea8a709f78aec4 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 16 Nov 2018 12:17:35 -0500
Subject: audit: remove WATCH and TREE config options

Remove the CONFIG_AUDIT_WATCH and CONFIG_AUDIT_TREE config options since
they are both dependent on CONFIG_AUDITSYSCALL and force
CONFIG_FSNOTIFY.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 init/Kconfig     |  9 ---------
 kernel/Makefile  |  4 +---
 kernel/audit.h   |  6 +++---
 kernel/auditsc.c | 10 ----------
 4 files changed, 4 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index a4112e95724a..7eb2538e6ca0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -335,15 +335,6 @@ config HAVE_ARCH_AUDITSYSCALL
 config AUDITSYSCALL
 	def_bool y
 	depends on AUDIT && HAVE_ARCH_AUDITSYSCALL
-
-config AUDIT_WATCH
-	def_bool y
-	depends on AUDITSYSCALL
-	select FSNOTIFY
-
-config AUDIT_TREE
-	def_bool y
-	depends on AUDITSYSCALL
 	select FSNOTIFY
 
 source "kernel/irq/Kconfig"
diff --git a/kernel/Makefile b/kernel/Makefile
index 7343b3a9bff0..9dc7f519129d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,9 +76,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
-obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
-obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
diff --git a/kernel/audit.h b/kernel/audit.h
index 9a3828bd387b..0b5295aeaebb 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -268,7 +268,7 @@ extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
 extern void audit_put_tty(struct tty_struct *tty);
 
 /* audit watch functions */
-#ifdef CONFIG_AUDIT_WATCH
+#ifdef CONFIG_AUDITSYSCALL
 extern void audit_put_watch(struct audit_watch *watch);
 extern void audit_get_watch(struct audit_watch *watch);
 extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
@@ -301,9 +301,9 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
 #define audit_mark_compare(m, i, d) 0
 #define audit_exe_compare(t, m) (-EINVAL)
 #define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDIT_WATCH */
+#endif /* CONFIG_AUDITSYSCALL */
 
-#ifdef CONFIG_AUDIT_TREE
+#ifdef CONFIG_AUDITSYSCALL
 extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
 extern void audit_put_chunk(struct audit_chunk *chunk);
 extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1513873e23bd..605f2d825204 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
  * References in it _are_ dropped - at the same time we free/drop aux stuff.
  */
 
-#ifdef CONFIG_AUDIT_TREE
 static void audit_set_auditable(struct audit_context *ctx)
 {
 	if (!ctx->prio) {
@@ -245,12 +244,10 @@ static int grow_tree_refs(struct audit_context *ctx)
 	ctx->tree_count = 31;
 	return 1;
 }
-#endif
 
 static void unroll_tree_refs(struct audit_context *ctx,
 		      struct audit_tree_refs *p, int count)
 {
-#ifdef CONFIG_AUDIT_TREE
 	struct audit_tree_refs *q;
 	int n;
 	if (!p) {
@@ -274,7 +271,6 @@ static void unroll_tree_refs(struct audit_context *ctx,
 	}
 	ctx->trees = p;
 	ctx->tree_count = count;
-#endif
 }
 
 static void free_tree_refs(struct audit_context *ctx)
@@ -288,7 +284,6 @@ static void free_tree_refs(struct audit_context *ctx)
 
 static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 {
-#ifdef CONFIG_AUDIT_TREE
 	struct audit_tree_refs *p;
 	int n;
 	if (!tree)
@@ -305,7 +300,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 			if (audit_tree_match(p->c[n], tree))
 				return 1;
 	}
-#endif
 	return 0;
 }
 
@@ -1602,7 +1596,6 @@ void __audit_syscall_exit(int success, long return_code)
 
 static inline void handle_one(const struct inode *inode)
 {
-#ifdef CONFIG_AUDIT_TREE
 	struct audit_context *context;
 	struct audit_tree_refs *p;
 	struct audit_chunk *chunk;
@@ -1627,12 +1620,10 @@ static inline void handle_one(const struct inode *inode)
 		return;
 	}
 	put_tree_ref(context, chunk);
-#endif
 }
 
 static void handle_path(const struct dentry *dentry)
 {
-#ifdef CONFIG_AUDIT_TREE
 	struct audit_context *context;
 	struct audit_tree_refs *p;
 	const struct dentry *d, *parent;
@@ -1685,7 +1676,6 @@ retry:
 		return;
 	}
 	rcu_read_unlock();
-#endif
 }
 
 static struct audit_names *audit_alloc_name(struct audit_context *context,
-- 
cgit 


From 96b3b6c9091d23289721350e32c63cc8749686be Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 16 Nov 2018 11:41:08 +0000
Subject: bpf: allow zero-initializing hash map seed

Add a new flag BPF_F_ZERO_SEED, which forces a hash map
to initialize the seed to zero. This is useful when doing
performance analysis both on individual BPF programs, as
well as the kernel's hash table implementation.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  3 +++
 kernel/bpf/hashtab.c     | 13 +++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 47d606d744cc..8c01b89a4cb4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -269,6 +269,9 @@ enum bpf_attach_type {
 /* Flag for stack_map, store build_id+offset instead of pointer */
 #define BPF_F_STACK_BUILD_ID	(1U << 5)
 
+/* Zero-initialize hash function seed. This should only be used for testing. */
+#define BPF_F_ZERO_SEED		(1U << 6)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c1790288138..4b7c76765d9d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,7 +23,7 @@
 
 #define HTAB_CREATE_FLAG_MASK						\
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
-	 BPF_F_RDONLY | BPF_F_WRONLY)
+	 BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
 
 struct bucket {
 	struct hlist_nulls_head head;
@@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	 */
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
 	int numa_node = bpf_map_attr_numa_node(attr);
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 		 */
 		return -EPERM;
 
+	if (zero_seed && !capable(CAP_SYS_ADMIN))
+		/* Guard against local DoS, and discourage production use. */
+		return -EPERM;
+
 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
 		return -EINVAL;
@@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (!htab->buckets)
 		goto free_htab;
 
-	htab->hashrnd = get_random_int();
+	if (htab->map.map_flags & BPF_F_ZERO_SEED)
+		htab->hashrnd = 0;
+	else
+		htab->hashrnd = get_random_int();
+
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
-- 
cgit 


From e9d81a1bc2c48ea9782e3e8b53875f419766ef47 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 8 Nov 2018 12:15:15 -0800
Subject: cgroup: fix CSS_TASK_ITER_PROCS

CSS_TASK_ITER_PROCS implements process-only iteration by making
css_task_iter_advance() skip tasks which aren't threadgroup leaders;
however, when an iteration is started css_task_iter_start() calls the
inner helper function css_task_iter_advance_css_set() instead of
css_task_iter_advance().  As the helper doesn't have the skip logic,
when the first task to visit is a non-leader thread, it doesn't get
skipped correctly as shown in the following example.

  # ps -L 2030
    PID   LWP TTY      STAT   TIME COMMAND
   2030  2030 pts/0    Sl+    0:00 ./test-thread
   2030  2031 pts/0    Sl+    0:00 ./test-thread
  # mkdir -p /sys/fs/cgroup/x/a/b
  # echo threaded > /sys/fs/cgroup/x/a/cgroup.type
  # echo threaded > /sys/fs/cgroup/x/a/b/cgroup.type
  # echo 2030 > /sys/fs/cgroup/x/a/cgroup.procs
  # cat /sys/fs/cgroup/x/a/cgroup.threads
  2030
  2031
  # cat /sys/fs/cgroup/x/cgroup.procs
  2030
  # echo 2030 > /sys/fs/cgroup/x/a/b/cgroup.threads
  # cat /sys/fs/cgroup/x/cgroup.procs
  2031
  2030

The last read of cgroup.procs is incorrectly showing non-leader 2031
in cgroup.procs output.

This can be fixed by updating css_task_iter_advance() to handle the
first advance and css_task_iters_tart() to call
css_task_iter_advance() instead of the inner helper.  After the fix,
the same commands result in the following (correct) result:

  # ps -L 2062
    PID   LWP TTY      STAT   TIME COMMAND
   2062  2062 pts/0    Sl+    0:00 ./test-thread
   2062  2063 pts/0    Sl+    0:00 ./test-thread
  # mkdir -p /sys/fs/cgroup/x/a/b
  # echo threaded > /sys/fs/cgroup/x/a/cgroup.type
  # echo threaded > /sys/fs/cgroup/x/a/b/cgroup.type
  # echo 2062 > /sys/fs/cgroup/x/a/cgroup.procs
  # cat /sys/fs/cgroup/x/a/cgroup.threads
  2062
  2063
  # cat /sys/fs/cgroup/x/cgroup.procs
  2062
  # echo 2062 > /sys/fs/cgroup/x/a/b/cgroup.threads
  # cat /sys/fs/cgroup/x/cgroup.procs
  2062

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Fixes: 8cfd8147df67 ("cgroup: implement cgroup v2 thread support")
Cc: stable@vger.kernel.org # v4.14+
---
 kernel/cgroup/cgroup.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..1f84977fab47 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4202,20 +4202,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
 
 	lockdep_assert_held(&css_set_lock);
 repeat:
-	/*
-	 * Advance iterator to find next entry.  cset->tasks is consumed
-	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
-	 * next cset.
-	 */
-	next = it->task_pos->next;
+	if (it->task_pos) {
+		/*
+		 * Advance iterator to find next entry.  cset->tasks is
+		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
+		 * we move onto the next cset.
+		 */
+		next = it->task_pos->next;
 
-	if (next == it->tasks_head)
-		next = it->mg_tasks_head->next;
+		if (next == it->tasks_head)
+			next = it->mg_tasks_head->next;
 
-	if (next == it->mg_tasks_head)
+		if (next == it->mg_tasks_head)
+			css_task_iter_advance_css_set(it);
+		else
+			it->task_pos = next;
+	} else {
+		/* called from start, proceed to the first cset */
 		css_task_iter_advance_css_set(it);
-	else
-		it->task_pos = next;
+	}
 
 	/* if PROCS, skip over tasks which aren't group leaders */
 	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4255,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 
 	it->cset_head = it->cset_pos;
 
-	css_task_iter_advance_css_set(it);
+	css_task_iter_advance(it);
 
 	spin_unlock_irq(&css_set_lock);
 }
-- 
cgit 


From b47a0bd23e34022aa0d4b812fcebe85cb0c54d49 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 19 Nov 2018 15:29:06 -0800
Subject: bpf: btf: Break up btf_type_is_void()

This patch breaks up btf_type_is_void() into
btf_type_is_void() and btf_type_is_fwd().

It also adds btf_type_nosize() to better describe it is
testing a type has nosize info.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ee4c82667d65..2a50d87de485 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -306,15 +306,22 @@ static bool btf_type_is_modifier(const struct btf_type *t)
 
 static bool btf_type_is_void(const struct btf_type *t)
 {
-	/* void => no type and size info.
-	 * Hence, FWD is also treated as void.
-	 */
-	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+	return t == &btf_void;
+}
+
+static bool btf_type_is_fwd(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+	return btf_type_is_void(t) || btf_type_is_fwd(t);
 }
 
-static bool btf_type_is_void_or_null(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
 {
-	return !t || btf_type_is_void(t);
+	return !t || btf_type_nosize(t);
 }
 
 /* union is only a special case of struct:
@@ -826,7 +833,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 	u32 size = 0;
 
 	size_type = btf_type_by_id(btf, size_type_id);
-	if (btf_type_is_void_or_null(size_type))
+	if (btf_type_nosize_or_null(size_type))
 		return NULL;
 
 	if (btf_type_has_size(size_type)) {
@@ -842,7 +849,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 		size = btf->resolved_sizes[size_type_id];
 		size_type_id = btf->resolved_ids[size_type_id];
 		size_type = btf_type_by_id(btf, size_type_id);
-		if (btf_type_is_void(size_type))
+		if (btf_type_nosize_or_null(size_type))
 			return NULL;
 	}
 
@@ -1164,7 +1171,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	}
 
 	/* "typedef void new_void", "const void"...etc */
-	if (btf_type_is_void(next_type))
+	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
 		goto resolved;
 
 	if (!env_type_is_resolve_sink(env, next_type) &&
@@ -1178,7 +1185,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	 * pretty print).
 	 */
 	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
 		btf_verifier_log_type(env, v->t, "Invalid type_id");
 		return -EINVAL;
 	}
@@ -1205,7 +1212,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	}
 
 	/* "void *" */
-	if (btf_type_is_void(next_type))
+	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
 		goto resolved;
 
 	if (!env_type_is_resolve_sink(env, next_type) &&
@@ -1235,7 +1242,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	}
 
 	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
 		btf_verifier_log_type(env, v->t, "Invalid type_id");
 		return -EINVAL;
 	}
@@ -1396,7 +1403,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 	/* Check array->index_type */
 	index_type_id = array->index_type;
 	index_type = btf_type_by_id(btf, index_type_id);
-	if (btf_type_is_void_or_null(index_type)) {
+	if (btf_type_nosize_or_null(index_type)) {
 		btf_verifier_log_type(env, v->t, "Invalid index");
 		return -EINVAL;
 	}
@@ -1415,7 +1422,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 	/* Check array->type */
 	elem_type_id = array->type;
 	elem_type = btf_type_by_id(btf, elem_type_id);
-	if (btf_type_is_void_or_null(elem_type)) {
+	if (btf_type_nosize_or_null(elem_type)) {
 		btf_verifier_log_type(env, v->t,
 				      "Invalid elem");
 		return -EINVAL;
@@ -1615,7 +1622,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 		const struct btf_type *member_type = btf_type_by_id(env->btf,
 								member_type_id);
 
-		if (btf_type_is_void_or_null(member_type)) {
+		if (btf_type_nosize_or_null(member_type)) {
 			btf_verifier_log_member(env, v->t, member,
 						"Invalid member");
 			return -EINVAL;
-- 
cgit 


From 2667a2626f4da370409c2830552f6e8c8b8c41e2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 19 Nov 2018 15:29:08 -0800
Subject: bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO

This patch adds BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
to support the function debug info.

BTF_KIND_FUNC_PROTO must not have a name (i.e. !t->name_off)
and it is followed by >= 0 'struct bpf_param' objects to
describe the function arguments.

The BTF_KIND_FUNC must have a valid name and it must
refer back to a BTF_KIND_FUNC_PROTO.

The above is the conclusion after the discussion between
Edward Cree, Alexei, Daniel, Yonghong and Martin.

By combining BTF_KIND_FUNC and BTF_LIND_FUNC_PROTO,
a complete function signature can be obtained.  It will be
used in the later patches to learn the function signature of
a running bpf program.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/btf.h |  18 ++-
 kernel/bpf/btf.c         | 389 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 354 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 972265f32871..14f66948fc95 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -40,7 +40,8 @@ struct btf_type {
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
 	 * "size" tells the size of the type it is describing.
 	 *
-	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+	 * FUNC and FUNC_PROTO.
 	 * "type" is a type_id referring to another type.
 	 */
 	union {
@@ -64,8 +65,10 @@ struct btf_type {
 #define BTF_KIND_VOLATILE	9	/* Volatile	*/
 #define BTF_KIND_CONST		10	/* Const	*/
 #define BTF_KIND_RESTRICT	11	/* Restrict	*/
-#define BTF_KIND_MAX		11
-#define NR_BTF_KINDS		12
+#define BTF_KIND_FUNC		12	/* Function	*/
+#define BTF_KIND_FUNC_PROTO	13	/* Function Proto	*/
+#define BTF_KIND_MAX		13
+#define NR_BTF_KINDS		14
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -110,4 +113,13 @@ struct btf_member {
 	__u32	offset;	/* offset in bits */
 };
 
+/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
+ * The exact number of btf_param is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_param {
+	__u32	name_off;
+	__u32	type;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2a50d87de485..6a2be79b73fc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5,6 +5,7 @@
 #include <uapi/linux/types.h>
 #include <linux/seq_file.h>
 #include <linux/compiler.h>
+#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
@@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_VOLATILE]	= "VOLATILE",
 	[BTF_KIND_CONST]	= "CONST",
 	[BTF_KIND_RESTRICT]	= "RESTRICT",
+	[BTF_KIND_FUNC]		= "FUNC",
+	[BTF_KIND_FUNC_PROTO]	= "FUNC_PROTO",
 };
 
 struct btf_kind_operations {
@@ -281,6 +284,9 @@ struct btf_kind_operations {
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
 static struct btf_type btf_void;
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id);
+
 static bool btf_type_is_modifier(const struct btf_type *t)
 {
 	/* Some of them is not strictly a C modifier
@@ -314,9 +320,20 @@ static bool btf_type_is_fwd(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
 }
 
+static bool btf_type_is_func(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
 static bool btf_type_nosize(const struct btf_type *t)
 {
-	return btf_type_is_void(t) || btf_type_is_fwd(t);
+	return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+	       btf_type_is_func(t) || btf_type_is_func_proto(t);
 }
 
 static bool btf_type_nosize_or_null(const struct btf_type *t)
@@ -433,6 +450,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 		offset < btf->hdr.str_len;
 }
 
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+	/* offset must be valid */
+	const char *src = &btf->strings[offset];
+	const char *src_limit;
+
+	if (!isalpha(*src) && *src != '_')
+		return false;
+
+	/* set a limit on identifier length */
+	src_limit = src + KSYM_NAME_LEN;
+	src++;
+	while (*src && src < src_limit) {
+		if (!isalnum(*src) && *src != '_')
+			return false;
+		src++;
+	}
+
+	return !*src;
+}
+
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!offset)
@@ -747,11 +788,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
 		/* int, enum or void is a sink */
 		return !btf_type_needs_resolve(next_type);
 	case RESOLVE_PTR:
-		/* int, enum, void, struct or array is a sink for ptr */
+		/* int, enum, void, struct, array, func or func_proto is a sink
+		 * for ptr
+		 */
 		return !btf_type_is_modifier(next_type) &&
 			!btf_type_is_ptr(next_type);
 	case RESOLVE_STRUCT_OR_ARRAY:
-		/* int, enum, void or ptr is a sink for struct and array */
+		/* int, enum, void, ptr, func or func_proto is a sink
+		 * for struct and array
+		 */
 		return !btf_type_is_modifier(next_type) &&
 			!btf_type_is_array(next_type) &&
 			!btf_type_is_struct(next_type);
@@ -1170,10 +1215,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* "typedef void new_void", "const void"...etc */
-	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
-		goto resolved;
-
 	if (!env_type_is_resolve_sink(env, next_type) &&
 	    !env_type_is_resolved(env, next_type_id))
 		return env_stack_push(env, next_type, next_type_id);
@@ -1184,13 +1225,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	 * save us a few type-following when we use it later (e.g. in
 	 * pretty print).
 	 */
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
-		btf_verifier_log_type(env, v->t, "Invalid type_id");
-		return -EINVAL;
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+		if (env_type_is_resolved(env, next_type_id))
+			next_type = btf_type_id_resolve(btf, &next_type_id);
+
+		/* "typedef void new_void", "const void"...etc */
+		if (!btf_type_is_void(next_type) &&
+		    !btf_type_is_fwd(next_type)) {
+			btf_verifier_log_type(env, v->t, "Invalid type_id");
+			return -EINVAL;
+		}
 	}
 
-resolved:
 	env_stack_pop_resolved(env, next_type_id, next_type_size);
 
 	return 0;
@@ -1203,7 +1249,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	const struct btf_type *t = v->t;
 	u32 next_type_id = t->type;
 	struct btf *btf = env->btf;
-	u32 next_type_size = 0;
 
 	next_type = btf_type_by_id(btf, next_type_id);
 	if (!next_type) {
@@ -1211,10 +1256,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* "void *" */
-	if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type))
-		goto resolved;
-
 	if (!env_type_is_resolve_sink(env, next_type) &&
 	    !env_type_is_resolved(env, next_type_id))
 		return env_stack_push(env, next_type, next_type_id);
@@ -1241,13 +1282,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 					      resolved_type_id);
 	}
 
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
-	    !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) {
-		btf_verifier_log_type(env, v->t, "Invalid type_id");
-		return -EINVAL;
+	if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+		if (env_type_is_resolved(env, next_type_id))
+			next_type = btf_type_id_resolve(btf, &next_type_id);
+
+		if (!btf_type_is_void(next_type) &&
+		    !btf_type_is_fwd(next_type) &&
+		    !btf_type_is_func_proto(next_type)) {
+			btf_verifier_log_type(env, v->t, "Invalid type_id");
+			return -EINVAL;
+		}
 	}
 
-resolved:
 	env_stack_pop_resolved(env, next_type_id, 0);
 
 	return 0;
@@ -1787,6 +1833,232 @@ static struct btf_kind_operations enum_ops = {
 	.seq_show = btf_enum_seq_show,
 };
 
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+				     const struct btf_type *t,
+				     u32 meta_left)
+{
+	u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->name_off) {
+		btf_verifier_log_type(env, t, "Invalid name");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+			       const struct btf_type *t)
+{
+	const struct btf_param *args = (const struct btf_param *)(t + 1);
+	u16 nr_args = btf_type_vlen(t), i;
+
+	btf_verifier_log(env, "return=%u args=(", t->type);
+	if (!nr_args) {
+		btf_verifier_log(env, "void");
+		goto done;
+	}
+
+	if (nr_args == 1 && !args[0].type) {
+		/* Only one vararg */
+		btf_verifier_log(env, "vararg");
+		goto done;
+	}
+
+	btf_verifier_log(env, "%u %s", args[0].type,
+			 btf_name_by_offset(env->btf,
+					    args[0].name_off));
+	for (i = 1; i < nr_args - 1; i++)
+		btf_verifier_log(env, ", %u %s", args[i].type,
+				 btf_name_by_offset(env->btf,
+						    args[i].name_off));
+
+	if (nr_args > 1) {
+		const struct btf_param *last_arg = &args[nr_args - 1];
+
+		if (last_arg->type)
+			btf_verifier_log(env, ", %u %s", last_arg->type,
+					 btf_name_by_offset(env->btf,
+							    last_arg->name_off));
+		else
+			btf_verifier_log(env, ", vararg");
+	}
+
+done:
+	btf_verifier_log(env, ")");
+}
+
+static struct btf_kind_operations func_proto_ops = {
+	.check_meta = btf_func_proto_check_meta,
+	.resolve = btf_df_resolve,
+	/*
+	 * BTF_KIND_FUNC_PROTO cannot be directly referred by
+	 * a struct's member.
+	 *
+	 * It should be a funciton pointer instead.
+	 * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+	 *
+	 * Hence, there is no btf_func_check_member().
+	 */
+	.check_member = btf_df_check_member,
+	.log_details = btf_func_proto_log,
+	.seq_show = btf_df_seq_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+			       const struct btf_type *t,
+			       u32 meta_left)
+{
+	if (!t->name_off ||
+	    !btf_name_valid_identifier(env->btf, t->name_off)) {
+		btf_verifier_log_type(env, t, "Invalid name");
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static struct btf_kind_operations func_ops = {
+	.check_meta = btf_func_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
+	.log_details = btf_ref_type_log,
+	.seq_show = btf_df_seq_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+				const struct btf_type *t)
+{
+	const struct btf_type *ret_type;
+	const struct btf_param *args;
+	const struct btf *btf;
+	u16 nr_args, i;
+	int err;
+
+	btf = env->btf;
+	args = (const struct btf_param *)(t + 1);
+	nr_args = btf_type_vlen(t);
+
+	/* Check func return type which could be "void" (t->type == 0) */
+	if (t->type) {
+		u32 ret_type_id = t->type;
+
+		ret_type = btf_type_by_id(btf, ret_type_id);
+		if (!ret_type) {
+			btf_verifier_log_type(env, t, "Invalid return type");
+			return -EINVAL;
+		}
+
+		if (btf_type_needs_resolve(ret_type) &&
+		    !env_type_is_resolved(env, ret_type_id)) {
+			err = btf_resolve(env, ret_type, ret_type_id);
+			if (err)
+				return err;
+		}
+
+		/* Ensure the return type is a type that has a size */
+		if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+			btf_verifier_log_type(env, t, "Invalid return type");
+			return -EINVAL;
+		}
+	}
+
+	if (!nr_args)
+		return 0;
+
+	/* Last func arg type_id could be 0 if it is a vararg */
+	if (!args[nr_args - 1].type) {
+		if (args[nr_args - 1].name_off) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u",
+					      nr_args);
+			return -EINVAL;
+		}
+		nr_args--;
+	}
+
+	err = 0;
+	for (i = 0; i < nr_args; i++) {
+		const struct btf_type *arg_type;
+		u32 arg_type_id;
+
+		arg_type_id = args[i].type;
+		arg_type = btf_type_by_id(btf, arg_type_id);
+		if (!arg_type) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+
+		if (args[i].name_off &&
+		    (!btf_name_offset_valid(btf, args[i].name_off) ||
+		     !btf_name_valid_identifier(btf, args[i].name_off))) {
+			btf_verifier_log_type(env, t,
+					      "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+
+		if (btf_type_needs_resolve(arg_type) &&
+		    !env_type_is_resolved(env, arg_type_id)) {
+			err = btf_resolve(env, arg_type, arg_type_id);
+			if (err)
+				break;
+		}
+
+		if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			err = -EINVAL;
+			break;
+		}
+	}
+
+	return err;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+			  const struct btf_type *t)
+{
+	const struct btf_type *proto_type;
+	const struct btf_param *args;
+	const struct btf *btf;
+	u16 nr_args, i;
+
+	btf = env->btf;
+	proto_type = btf_type_by_id(btf, t->type);
+
+	if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	args = (const struct btf_param *)(proto_type + 1);
+	nr_args = btf_type_vlen(proto_type);
+	for (i = 0; i < nr_args; i++) {
+		if (!args[i].name_off && args[i].type) {
+			btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_INT] = &int_ops,
 	[BTF_KIND_PTR] = &ptr_ops,
@@ -1799,6 +2071,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_VOLATILE] = &modifier_ops,
 	[BTF_KIND_CONST] = &modifier_ops,
 	[BTF_KIND_RESTRICT] = &modifier_ops,
+	[BTF_KIND_FUNC] = &func_ops,
+	[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
 };
 
 static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -1870,30 +2144,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	return 0;
 }
 
-static int btf_resolve(struct btf_verifier_env *env,
-		       const struct btf_type *t, u32 type_id)
-{
-	const struct resolve_vertex *v;
-	int err = 0;
-
-	env->resolve_mode = RESOLVE_TBD;
-	env_stack_push(env, t, type_id);
-	while (!err && (v = env_stack_peak(env))) {
-		env->log_type_id = v->type_id;
-		err = btf_type_ops(v->t)->resolve(env, v);
-	}
-
-	env->log_type_id = type_id;
-	if (err == -E2BIG)
-		btf_verifier_log_type(env, t,
-				      "Exceeded max resolving depth:%u",
-				      MAX_RESOLVE_DEPTH);
-	else if (err == -EEXIST)
-		btf_verifier_log_type(env, t, "Loop detected");
-
-	return err;
-}
-
 static bool btf_resolve_valid(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 type_id)
@@ -1927,6 +2177,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
 	return false;
 }
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id)
+{
+	u32 save_log_type_id = env->log_type_id;
+	const struct resolve_vertex *v;
+	int err = 0;
+
+	env->resolve_mode = RESOLVE_TBD;
+	env_stack_push(env, t, type_id);
+	while (!err && (v = env_stack_peak(env))) {
+		env->log_type_id = v->type_id;
+		err = btf_type_ops(v->t)->resolve(env, v);
+	}
+
+	env->log_type_id = type_id;
+	if (err == -E2BIG) {
+		btf_verifier_log_type(env, t,
+				      "Exceeded max resolving depth:%u",
+				      MAX_RESOLVE_DEPTH);
+	} else if (err == -EEXIST) {
+		btf_verifier_log_type(env, t, "Loop detected");
+	}
+
+	/* Final sanity check */
+	if (!err && !btf_resolve_valid(env, t, type_id)) {
+		btf_verifier_log_type(env, t, "Invalid resolve state");
+		err = -EINVAL;
+	}
+
+	env->log_type_id = save_log_type_id;
+	return err;
+}
+
 static int btf_check_all_types(struct btf_verifier_env *env)
 {
 	struct btf *btf = env->btf;
@@ -1949,10 +2232,16 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 				return err;
 		}
 
-		if (btf_type_needs_resolve(t) &&
-		    !btf_resolve_valid(env, t, type_id)) {
-			btf_verifier_log_type(env, t, "Invalid resolve state");
-			return -EINVAL;
+		if (btf_type_is_func_proto(t)) {
+			err = btf_func_proto_check(env, t);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_is_func(t)) {
+			err = btf_func_check(env, t);
+			if (err)
+				return err;
 		}
 	}
 
-- 
cgit 


From 838e96904ff3fc6c30e5ebbc611474669856e3c0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 19 Nov 2018 15:29:11 -0800
Subject: bpf: Introduce bpf_func_info

This patch added interface to load a program with the following
additional information:
   . prog_btf_fd
   . func_info, func_info_rec_size and func_info_cnt
where func_info will provide function range and type_id
corresponding to each function.

The func_info_rec_size is introduced in the UAPI to specify
struct bpf_func_info size passed from user space. This
intends to make bpf_func_info structure growable in the future.
If the kernel gets a different bpf_func_info size from userspace,
it will try to handle user request with part of bpf_func_info
it can understand. In this patch, kernel can understand
  struct bpf_func_info {
       __u32   insn_offset;
       __u32   type_id;
  };
If user passed a bpf func_info record size of 16 bytes, the
kernel can still handle part of records with the above definition.

If verifier agrees with function range provided by the user,
the bpf_prog ksym for each function will use the func name
provided in the type_id, which is supposed to provide better
encoding as it is not limited by 16 bytes program name
limitation and this is better for bpf program which contains
multiple subprograms.

The bpf_prog_info interface is also extended to
return btf_id, func_info, func_info_rec_size and func_info_cnt
to userspace, so userspace can print out the function prototype
for each xlated function. The insn_offset in the returned
func_info corresponds to the insn offset for xlated functions.
With other jit related fields in bpf_prog_info, userspace can also
print out function prototypes for each jited function.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   5 +-
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   2 +
 include/uapi/linux/bpf.h     |  13 +++++
 kernel/bpf/btf.c             |   4 +-
 kernel/bpf/core.c            |  13 +++++
 kernel/bpf/syscall.c         |  59 +++++++++++++++++++--
 kernel/bpf/verifier.c        | 120 ++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 209 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 987815152629..7f0e225bf630 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -316,6 +316,8 @@ struct bpf_prog_aux {
 	void *security;
 #endif
 	struct bpf_prog_offload *offload;
+	struct btf *btf;
+	u32 type_id; /* type id for this prog/func */
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -527,7 +529,8 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr,
+	      union bpf_attr __user *uattr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..204382f46fd8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -204,6 +204,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
+	u32 type_id; /* btf type_id for this subprog */
 };
 
 /* single container for all structs
diff --git a/include/linux/btf.h b/include/linux/btf.h
index e076c4697049..7f2c0a4a45ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
+const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 05d95290b848..c1554aa07465 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -338,6 +338,10 @@ union bpf_attr {
 		 * (context accesses, allowed helpers, etc).
 		 */
 		__u32		expected_attach_type;
+		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
+		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
+		__aligned_u64	func_info;	/* func info */
+		__u32		func_info_cnt;	/* number of bpf_func_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2638,6 +2642,10 @@ struct bpf_prog_info {
 	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
 	__aligned_u64 jited_func_lens;
+	__u32 btf_id;
+	__u32 func_info_rec_size;
+	__aligned_u64 func_info;
+	__u32 func_info_cnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2949,4 +2957,9 @@ struct bpf_flow_keys {
 	};
 };
 
+struct bpf_func_info {
+	__u32	insn_offset;
+	__u32	type_id;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6a2be79b73fc..69da9169819a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
 	return !*src;
 }
 
-static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!offset)
 		return "(anon)";
@@ -484,7 +484,7 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 		return "(invalid-name-offset)";
 }
 
-static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 {
 	if (type_id > btf->nr_types)
 		return NULL;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1a796e0799ec..16d77012ad3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,12 +21,14 @@
  * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  */
 
+#include <uapi/linux/btf.h>
 #include <linux/filter.h>
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
 #include <linux/moduleloader.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/frame.h>
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
@@ -390,6 +392,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
 	const char *end = sym + KSYM_NAME_LEN;
+	const struct btf_type *type;
+	const char *func_name;
 
 	BUILD_BUG_ON(sizeof("bpf_prog_") +
 		     sizeof(prog->tag) * 2 +
@@ -404,6 +408,15 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 
 	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+	/* prog->aux->name will be ignored if full btf name is available */
+	if (prog->aux->btf) {
+		type = btf_type_by_id(prog->aux->btf, prog->aux->type_id);
+		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+		return;
+	}
+
 	if (prog->aux->name[0])
 		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
 	else
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5040fd5434..998377808102 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1213,6 +1213,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
+		btf_put(prog->aux->btf);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -1437,9 +1438,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD expected_attach_type
+#define	BPF_PROG_LOAD_LAST_FIELD func_info_cnt
 
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog;
@@ -1525,7 +1526,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr);
+	err = bpf_check(&prog, attr, uattr);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -2079,6 +2080,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
+		info.func_info_cnt = 0;
 		goto done;
 	}
 
@@ -2216,6 +2218,55 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	if (prog->aux->btf) {
+		u32 ucnt, urec_size;
+
+		info.btf_id = btf_id(prog->aux->btf);
+
+		ucnt = info.func_info_cnt;
+		info.func_info_cnt = prog->aux->func_cnt ? : 1;
+		urec_size = info.func_info_rec_size;
+		info.func_info_rec_size = sizeof(struct bpf_func_info);
+		if (ucnt) {
+			/* expect passed-in urec_size is what the kernel expects */
+			if (urec_size != info.func_info_rec_size)
+				return -EINVAL;
+
+			if (bpf_dump_raw_ok()) {
+				struct bpf_func_info kern_finfo;
+				char __user *user_finfo;
+				u32 i, insn_offset;
+
+				user_finfo = u64_to_user_ptr(info.func_info);
+				if (prog->aux->func_cnt) {
+					ucnt = min_t(u32, info.func_info_cnt, ucnt);
+					insn_offset = 0;
+					for (i = 0; i < ucnt; i++) {
+						kern_finfo.insn_offset = insn_offset;
+						kern_finfo.type_id = prog->aux->func[i]->aux->type_id;
+						if (copy_to_user(user_finfo, &kern_finfo,
+								 sizeof(kern_finfo)))
+							return -EFAULT;
+
+						/* func[i]->len holds the prog len */
+						insn_offset += prog->aux->func[i]->len;
+						user_finfo += urec_size;
+					}
+				} else {
+					kern_finfo.insn_offset = 0;
+					kern_finfo.type_id = prog->aux->type_id;
+					if (copy_to_user(user_finfo, &kern_finfo,
+							 sizeof(kern_finfo)))
+						return -EFAULT;
+				}
+			} else {
+				info.func_info_cnt = 0;
+			}
+		}
+	} else {
+		info.func_info_cnt = 0;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
@@ -2501,7 +2552,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = map_get_next_key(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr);
+		err = bpf_prog_load(&attr, uattr);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b5222aa61d54..f102c4fd0c5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11,10 +11,12 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * General Public License for more details.
  */
+#include <uapi/linux/btf.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/filter.h>
 #include <net/netlink.h>
@@ -4639,6 +4641,114 @@ err_free:
 	return ret;
 }
 
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE	8
+#define MAX_FUNCINFO_REC_SIZE	252
+
+static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
+			  union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	u32 i, nfuncs, urec_size, min_size, prev_offset;
+	u32 krec_size = sizeof(struct bpf_func_info);
+	struct bpf_func_info krecord = {};
+	const struct btf_type *type;
+	void __user *urecord;
+	struct btf *btf;
+	int ret = 0;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs)
+		return 0;
+
+	if (nfuncs != env->subprog_cnt) {
+		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+		return -EINVAL;
+	}
+
+	urec_size = attr->func_info_rec_size;
+	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+	    urec_size > MAX_FUNCINFO_REC_SIZE ||
+	    urec_size % sizeof(u32)) {
+		verbose(env, "invalid func info rec size %u\n", urec_size);
+		return -EINVAL;
+	}
+
+	btf = btf_get_by_fd(attr->prog_btf_fd);
+	if (IS_ERR(btf)) {
+		verbose(env, "unable to get btf from fd\n");
+		return PTR_ERR(btf);
+	}
+
+	urecord = u64_to_user_ptr(attr->func_info);
+	min_size = min_t(u32, krec_size, urec_size);
+
+	for (i = 0; i < nfuncs; i++) {
+		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+		if (ret) {
+			if (ret == -E2BIG) {
+				verbose(env, "nonzero tailing record in func info");
+				/* set the size kernel expects so loader can zero
+				 * out the rest of the record.
+				 */
+				if (put_user(min_size, &uattr->func_info_rec_size))
+					ret = -EFAULT;
+			}
+			goto free_btf;
+		}
+
+		if (copy_from_user(&krecord, urecord, min_size)) {
+			ret = -EFAULT;
+			goto free_btf;
+		}
+
+		/* check insn_offset */
+		if (i == 0) {
+			if (krecord.insn_offset) {
+				verbose(env,
+					"nonzero insn_offset %u for the first func info record",
+					krecord.insn_offset);
+				ret = -EINVAL;
+				goto free_btf;
+			}
+		} else if (krecord.insn_offset <= prev_offset) {
+			verbose(env,
+				"same or smaller insn offset (%u) than previous func info record (%u)",
+				krecord.insn_offset, prev_offset);
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		if (env->subprog_info[i].start != krecord.insn_offset) {
+			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		/* check type_id */
+		type = btf_type_by_id(btf, krecord.type_id);
+		if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+			verbose(env, "invalid type id %d in func info",
+				krecord.type_id);
+			ret = -EINVAL;
+			goto free_btf;
+		}
+
+		if (i == 0)
+			prog->aux->type_id = krecord.type_id;
+		env->subprog_info[i].type_id = krecord.type_id;
+
+		prev_offset = krecord.insn_offset;
+		urecord += urec_size;
+	}
+
+	prog->aux->btf = btf;
+	return 0;
+
+free_btf:
+	btf_put(btf);
+	return ret;
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@@ -5939,6 +6049,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		/* the btf will be freed only at prog->aux */
+		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->type_id = env->subprog_info[i].type_id;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -6325,7 +6438,8 @@ static void free_states(struct bpf_verifier_env *env)
 	kfree(env->explored_states);
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+	      union bpf_attr __user *uattr)
 {
 	struct bpf_verifier_env *env;
 	struct bpf_verifier_log *log;
@@ -6397,6 +6511,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	ret = check_btf_func(env->prog, env, attr, uattr);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = do_check(env);
 	if (env->cur_state) {
 		free_verifier_state(env->cur_state, true);
-- 
cgit 


From 8d75839b843ae0ef8d9db97ed05b493e687e6b75 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 21 Nov 2018 21:39:52 -0800
Subject: bpf, lpm: make longest_prefix_match() faster

At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that longest_prefix_match()
has a high cost [1].

One reason for that cost is a loop handling one byte at a time.

We can handle more bytes at a time, if enough attention is paid
to endianness.

I was able to remove ~55 % of longest_prefix_match() cpu costs.

[1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Dumitrescu <vladum@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/lpm_trie.c | 59 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9058317ba9de..bfd4882e1106 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
 				   const struct lpm_trie_node *node,
 				   const struct bpf_lpm_trie_key *key)
 {
-	size_t prefixlen = 0;
-	size_t i;
+	u32 limit = min(node->prefixlen, key->prefixlen);
+	u32 prefixlen = 0, i = 0;
 
-	for (i = 0; i < trie->data_size; i++) {
-		size_t b;
+	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
 
-		b = 8 - fls(node->data[i] ^ key->data[i]);
-		prefixlen += b;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
 
-		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
-			return min(node->prefixlen, key->prefixlen);
+	/* data_size >= 16 has very small probability.
+	 * We do not use a loop for optimal code generation.
+	 */
+	if (trie->data_size >= 8) {
+		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+				       *(__be64 *)key->data);
+
+		prefixlen = 64 - fls64(diff);
+		if (prefixlen >= limit)
+			return limit;
+		if (diff)
+			return prefixlen;
+		i = 8;
+	}
+#endif
+
+	while (trie->data_size >= i + 4) {
+		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+				       *(__be32 *)&key->data[i]);
+
+		prefixlen += 32 - fls(diff);
+		if (prefixlen >= limit)
+			return limit;
+		if (diff)
+			return prefixlen;
+		i += 4;
+	}
 
-		if (b < 8)
-			break;
+	if (trie->data_size >= i + 2) {
+		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+				       *(__be16 *)&key->data[i]);
+
+		prefixlen += 16 - fls(diff);
+		if (prefixlen >= limit)
+			return limit;
+		if (diff)
+			return prefixlen;
+		i += 2;
+	}
+
+	if (trie->data_size >= i + 1) {
+		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+		if (prefixlen >= limit)
+			return limit;
 	}
 
 	return prefixlen;
-- 
cgit 


From c7c3f05e341a9a2bd1a92993d4f996cfd6e7348e Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Thu, 25 Oct 2018 19:10:36 +0900
Subject: panic: avoid deadlocks in re-entrant console drivers

From printk()/serial console point of view panic() is special, because
it may force CPU to re-enter printk() or/and serial console driver.
Therefore, some of serial consoles drivers are re-entrant. E.g. 8250:

serial8250_console_write()
{
	if (port->sysrq)
		locked = 0;
	else if (oops_in_progress)
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);
	...
}

panic() does set oops_in_progress via bust_spinlocks(1), so in theory
we should be able to re-enter serial console driver from panic():

	CPU0
	<NMI>
	uart_console_write()
	serial8250_console_write()		// if (oops_in_progress)
						//    spin_trylock_irqsave()
	call_console_drivers()
	console_unlock()
	console_flush_on_panic()
	bust_spinlocks(1)			// oops_in_progress++
	panic()
	<NMI/>
	spin_lock_irqsave(&port->lock, flags)   // spin_lock_irqsave()
	serial8250_console_write()
	call_console_drivers()
	console_unlock()
	printk()
	...

However, this does not happen and we deadlock in serial console on
port->lock spinlock. And the problem is that console_flush_on_panic()
called after bust_spinlocks(0):

void panic(const char *fmt, ...)
{
	bust_spinlocks(1);
	...
	bust_spinlocks(0);
	console_flush_on_panic();
	...
}

bust_spinlocks(0) decrements oops_in_progress, so oops_in_progress
can go back to zero. Thus even re-entrant console drivers will simply
spin on port->lock spinlock. Given that port->lock may already be
locked either by a stopped CPU, or by the very same CPU we execute
panic() on (for instance, NMI panic() on printing CPU) the system
deadlocks and does not reboot.

Fix this by removing bust_spinlocks(0), so oops_in_progress is always
set in panic() now and, thus, re-entrant console drivers will trylock
the port->lock instead of spinning on it forever, when we call them
from console_flush_on_panic().

Link: http://lkml.kernel.org/r/20181025101036.6823-1-sergey.senozhatsky@gmail.com
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Wang <wonderfly@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: linux-serial@vger.kernel.org
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/panic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 8b2e002d52eb..6a6df23acd1a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -14,6 +14,7 @@
 #include <linux/kmsg_dump.h>
 #include <linux/kallsyms.h>
 #include <linux/notifier.h>
+#include <linux/vt_kern.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/ftrace.h>
@@ -233,7 +234,10 @@ void panic(const char *fmt, ...)
 	if (_crash_kexec_post_notifiers)
 		__crash_kexec(NULL);
 
-	bust_spinlocks(0);
+#ifdef CONFIG_VT
+	unblank_screen();
+#endif
+	console_unblank();
 
 	/*
 	 * We may have ended up stopping the CPU holding the lock (in
-- 
cgit 


From 58c5fc2b96e4ae65068d815a1c3ca81da92fa1c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:08 +0100
Subject: time: Remove useless filenames in top level comments

Remove the pointless filenames in the top level comments. They have no
value at all and just occupy space. While at it tidy up some of the
comments and remove a stale one.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182252.794898238@linutronix.de
---
 include/linux/hrtimer.h              |  2 --
 kernel/time/clockevents.c            |  2 --
 kernel/time/clocksource.c            |  5 -----
 kernel/time/hrtimer.c                | 16 ++++------------
 kernel/time/itimer.c                 |  2 --
 kernel/time/jiffies.c                |  2 --
 kernel/time/posix-clock.c            |  2 +-
 kernel/time/posix-timers.c           |  4 ----
 kernel/time/sched_clock.c            |  4 ++--
 kernel/time/tick-broadcast-hrtimer.c |  4 +---
 kernel/time/tick-broadcast.c         |  2 --
 kernel/time/tick-common.c            |  2 --
 kernel/time/tick-oneshot.c           |  2 --
 kernel/time/tick-sched.c             |  2 --
 kernel/time/time.c                   | 12 ++++--------
 kernel/time/timecounter.c            |  6 +-----
 kernel/time/timekeeping.c            | 10 ++--------
 kernel/time/timer.c                  |  2 --
 kernel/time/timer_list.c             |  2 --
 19 files changed, 15 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3892e9c8b2de..50ebe2ad43e0 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -1,6 +1,4 @@
 /*
- *  include/linux/hrtimer.h
- *
  *  hrtimers - High-resolution kernel timers
  *
  *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index af58898d9ebf..9b8c7c0fd113 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -1,6 +1,4 @@
 /*
- * linux/kernel/time/clockevents.c
- *
  * This file contains functions which manage clock event devices.
  *
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ffe081623aec..1c5273fbd500 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1,6 +1,4 @@
 /*
- * linux/kernel/time/clocksource.c
- *
  * This file contains the functions which manage clocksource drivers.
  *
  * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
@@ -18,9 +16,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * TODO WishList:
- *   o Allow clocksource drivers to be unregistered
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9cdd74bd2d27..223548bb81c6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1,26 +1,18 @@
 /*
- *  linux/kernel/hrtimer.c
- *
  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
  *
  *  High-resolution kernel timers
  *
- *  In contrast to the low-resolution timeout API implemented in
- *  kernel/timer.c, hrtimers provide finer resolution and accuracy
- *  depending on system configuration and capabilities.
- *
- *  These timers are currently used for:
- *   - itimers
- *   - POSIX timers
- *   - nanosleep
- *   - precise in-kernel timing
+ *  In contrast to the low-resolution timeout API, aka timer wheel,
+ *  hrtimers provide finer resolution and accuracy depending on system
+ *  configuration and capabilities.
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
  *  Credits:
- *	based on kernel/timer.c
+ *	Based on the original timer wheel code
  *
  *	Help, testing, suggestions, bugfixes, improvements were
  *	provided by:
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 9a65713c8309..02068b2d5862 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * linux/kernel/itimer.c
- *
  * Copyright (C) 1992 Darren Senn
  */
 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 497719127bf9..9c3957fe9317 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -1,6 +1,4 @@
 /***********************************************************************
-* linux/kernel/time/jiffies.c
-*
 * This file contains the jiffies based clocksource.
 *
 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index fe56c4e06c51..4959815f4fd7 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -1,5 +1,5 @@
 /*
- * posix-clock.c - support for dynamic clock devices
+ * Support for dynamic clock devices
  *
  * Copyright (C) 2010 OMICRON electronics GmbH
  *
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bd62b5eeb5a0..c72307c119d9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1,10 +1,6 @@
 /*
- * linux/kernel/posix-timers.c
- *
- *
  * 2002-10-15  Posix Clocks & timers
  *                           by George Anzinger george@mvista.com
- *
  *			     Copyright (C) 2002 2003 by MontaVista Software.
  *
  * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cbc72c2c1fca..b38b6628f89b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,6 +1,6 @@
 /*
- * sched_clock.c: Generic sched_clock() support, to extend low level
- *                hardware time counters to full 64-bit ns values.
+ * Generic sched_clock() support, to extend low level hardware time
+ * counters to full 64-bit ns values.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a59641fb88b6..5be6154e2fd2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * linux/kernel/time/tick-broadcast-hrtimer.c
- * This file emulates a local clock event device
- * via a pseudo clock device.
+ * Emulate a local clock event device via a pseudo clock device.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index be0aac2b4300..4f5abde2dfa7 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1,6 +1,4 @@
 /*
- * linux/kernel/time/tick-broadcast.c
- *
  * This file contains functions which emulate a local clock-event
  * device via a broadcast event source.
  *
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 14de3727b18e..7b5008039c2d 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -1,6 +1,4 @@
 /*
- * linux/kernel/time/tick-common.c
- *
  * This file contains the base functions to manage periodic tick
  * related events.
  *
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 6fe615d57ebb..77989efe13d2 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -1,6 +1,4 @@
 /*
- * linux/kernel/time/tick-oneshot.c
- *
  * This file contains functions which manage high resolution tick
  * related events.
  *
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69e673b88474..cb557e56a19f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1,6 +1,4 @@
 /*
- *  linux/kernel/time/tick-sched.c
- *
  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
diff --git a/kernel/time/time.c b/kernel/time/time.c
index ad204cf6d001..13ffa9950ffc 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -1,14 +1,10 @@
 /*
- *  linux/kernel/time.c
- *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
- *  This file contains the interface functions for the various
- *  time related system calls: time, stime, gettimeofday, settimeofday,
- *			       adjtime
- */
-/*
- * Modification history kernel/time.c
+ *  This file contains the interface functions for the various time related
+ *  system calls: time, stime, gettimeofday, settimeofday, adjtime
+ *
+ * Modification history:
  *
  * 1993-09-02    Philip Gladstone
  *      Created file with time related functions from sched/core.c and adjtimex()
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 8afd78932bdf..400f3456d564 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -1,8 +1,5 @@
 /*
- * linux/kernel/time/timecounter.c
- *
- * based on code that migrated away from
- * linux/kernel/time/clocksource.c
+ * Based on clocksource code. See commit 74d23cc704d1
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -14,7 +11,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
-
 #include <linux/export.h>
 #include <linux/timecounter.h>
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d110c948805..30fdf48f50c2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1,13 +1,7 @@
 /*
- *  linux/kernel/time/timekeeping.c
- *
- *  Kernel timekeeping code and accessor functions
- *
- *  This code was moved from linux/kernel/timer.c.
- *  Please see that file for copyright and history logs.
- *
+ *  Kernel timekeeping code and accessor functions. Based on code from
+ *  timer.c, moved in commit 8524070b7982.
  */
-
 #include <linux/timekeeper_internal.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index fa49cd753dea..2f248bbedb4a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1,6 +1,4 @@
 /*
- *  linux/kernel/timer.c
- *
  *  Kernel internal timers
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index d647dabdac97..5d64fff384c8 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -1,6 +1,4 @@
 /*
- * kernel/time/timer_list.c
- *
  * List pending timers
  *
  * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
-- 
cgit 


From 35728b8209ee7d25b6241a56304ee926469bd154 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:09 +0100
Subject: time: Add SPDX license identifiers

Update the time(r) core files files with the correct SPDX license
identifier based on the license text in the file itself. The SPDX
identifier is a legally binding shorthand, which can be used instead of the
full boiler plate text.

This work is based on a script and data from Philippe Ombredanne, Kate
Stewart and myself. The data has been created with two independent license
scanners and manual inspection.

The following files do not contain any direct license information and have
been omitted from the big initial SPDX changes:

  timeconst.bc: The .bc files were not touched
  time.c, timer.c, timekeeping.c: Licence was deduced from EXPORT_SYMBOL_GPL

As those files do not contain direct license references they fall under the
project license, i.e. GPL V2 only.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20181031182252.879109557@linutronix.de
---
 include/linux/hrtimer.h         | 1 +
 kernel/time/alarmtimer.c        | 1 +
 kernel/time/clockevents.c       | 1 +
 kernel/time/clocksource.c       | 1 +
 kernel/time/hrtimer.c           | 1 +
 kernel/time/jiffies.c           | 1 +
 kernel/time/posix-clock.c       | 1 +
 kernel/time/posix-stubs.c       | 1 +
 kernel/time/posix-timers.c      | 1 +
 kernel/time/sched_clock.c       | 1 +
 kernel/time/test_udelay.c       | 1 +
 kernel/time/tick-broadcast.c    | 1 +
 kernel/time/tick-common.c       | 1 +
 kernel/time/tick-oneshot.c      | 1 +
 kernel/time/tick-sched.c        | 1 +
 kernel/time/time.c              | 1 +
 kernel/time/timeconst.bc        | 2 ++
 kernel/time/timeconv.c          | 1 +
 kernel/time/timecounter.c       | 1 +
 kernel/time/timekeeping.c       | 1 +
 kernel/time/timekeeping_debug.c | 1 +
 kernel/time/timer.c             | 1 +
 kernel/time/timer_list.c        | 1 +
 23 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 50ebe2ad43e0..851e4231d3ab 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  hrtimers - High-resolution kernel timers
  *
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fa5de5e8de61..69070d399d70 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Alarmtimer interface
  *
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9b8c7c0fd113..0fdbdf17f8a2 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file contains functions which manage clock event devices.
  *
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1c5273fbd500..b1abeac5f3f7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * This file contains the functions which manage clocksource drivers.
  *
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 223548bb81c6..16dacc8d3ca2 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 9c3957fe9317..0deb0be2c445 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /***********************************************************************
 * This file contains the jiffies based clocksource.
 *
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 4959815f4fd7..339e35e4605f 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * Support for dynamic clock devices
  *
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 989ccf028bde..b9f9f6f02e11 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Dummy stubs used when CONFIG_POSIX_TIMERS=n
  *
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index c72307c119d9..e8cd9aa6c9cf 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * 2002-10-15  Posix Clocks & timers
  *                           by George Anzinger george@mvista.com
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b38b6628f89b..11570ba451cc 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Generic sched_clock() support, to extend low level hardware time
  * counters to full 64-bit ns values.
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index b0928ab3270f..d6a87bb2040f 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * udelay() test kernel module
  *
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 4f5abde2dfa7..f4725f53d852 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file contains functions which emulate a local clock-event
  * device via a broadcast event source.
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7b5008039c2d..455b8d65a2b7 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file contains the base functions to manage periodic tick
  * related events.
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 77989efe13d2..1c8ad0fb33c0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * This file contains functions which manage high resolution tick
  * related events.
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb557e56a19f..62ecb2a802ca 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 13ffa9950ffc..5aa0a156e331 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index f83bbb81600b..7ed0e0fb5831 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
 scale=0
 
 define gcd(a,b) {
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 7142580ad94f..589e0a552129 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.0+
 /*
  * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
  * This file is part of the GNU C Library.
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 400f3456d564..933462326489 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * Based on clocksource code. See commit 74d23cc704d1
  *
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 30fdf48f50c2..cd02bd38cf2d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Kernel timekeeping code and accessor functions. Based on code from
  *  timer.c, moved in commit 8524070b7982.
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 238e4be60229..d06f09209fb7 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * debugfs file to track time spent in suspend
  *
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2f248bbedb4a..444156debfa0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  Kernel internal timers
  *
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 5d64fff384c8..f81693cdf981 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * List pending timers
  *
-- 
cgit 


From f49c174b5f431db9fa17315269e288d4548b651c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:10 +0100
Subject: hrtimers/tick/clockevents: Remove sloppy license references

"For licencing details see kernel-base/COPYING" and similar license
references have no value over the SPDX identifier. Remove them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182252.963632760@linutronix.de
---
 include/linux/hrtimer.h      | 2 --
 kernel/time/clockevents.c    | 3 ---
 kernel/time/hrtimer.c        | 2 --
 kernel/time/tick-broadcast.c | 3 ---
 kernel/time/tick-common.c    | 3 ---
 kernel/time/tick-oneshot.c   | 3 ---
 kernel/time/tick-sched.c     | 2 --
 kernel/time/timer_list.c     | 4 ----
 8 files changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 851e4231d3ab..2e8957eac4d4 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -8,8 +8,6 @@
  *  data type definitions, declarations, prototypes
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
- *
- *  For licencing details see kernel-base/COPYING
  */
 #ifndef _LINUX_HRTIMER_H
 #define _LINUX_HRTIMER_H
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0fdbdf17f8a2..5e77662dd2d9 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -5,9 +5,6 @@
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
  */
 
 #include <linux/clockchips.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 16dacc8d3ca2..f5cfa1b73d6f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -20,8 +20,6 @@
  *
  *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
  *	et. al.
- *
- *  For licencing details see kernel-base/COPYING
  */
 
 #include <linux/cpu.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f4725f53d852..803fa67aace9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -6,9 +6,6 @@
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 455b8d65a2b7..529143b4c8d2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -6,9 +6,6 @@
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 1c8ad0fb33c0..f9745d47425a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -6,9 +6,6 @@
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
- *
- * This code is licenced under the GPL version 2. For details see
- * kernel-base/COPYING.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 62ecb2a802ca..6fa52cd6df0b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -7,8 +7,6 @@
  *  No idle tick implementation for low and high resolution timers
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
- *
- *  Distribute under GPLv2.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f81693cdf981..98ba50dcb1b2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -3,10 +3,6 @@
  * List pending timers
  *
  * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/proc_fs.h>
-- 
cgit 


From 9281a7857b91cf4d283be7c86d80e5d091bfb3d9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:11 +0100
Subject: time/debug: Remove license boilerplate

The SPDX identifier is enough. Remove the license boilerplate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182253.047449481@linutronix.de
---
 kernel/time/test_udelay.c       |  9 ---------
 kernel/time/timekeeping_debug.c | 10 ----------
 2 files changed, 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index d6a87bb2040f..77c63005dc4e 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -8,15 +8,6 @@
  * Specifying usecs of 0 or negative values will run multiples tests.
  *
  * Copyright (C) 2014 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/debugfs.h>
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index d06f09209fb7..f811882cfd13 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -3,16 +3,6 @@
  * debugfs file to track time spent in suspend
  *
  * Copyright (c) 2011, Google, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/debugfs.h>
-- 
cgit 


From 6c7811c628a96fe00965622f3aa1cf272cb898f7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:12 +0100
Subject: time: Remove license boilerplate

The SPDX identifier defines the license of the files already. No need for
the boilerplates.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20181031182253.132458951@linutronix.de
---
 kernel/time/alarmtimer.c  |  4 ----
 kernel/time/clocksource.c | 14 --------------
 kernel/time/jiffies.c     | 25 +++++--------------------
 kernel/time/timecounter.c | 10 ----------
 4 files changed, 5 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 69070d399d70..2c97e8c2d29f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -11,10 +11,6 @@
  * Copyright (C) 2010 IBM Corperation
  *
  * Author: John Stultz <john.stultz@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/time.h>
 #include <linux/hrtimer.h>
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b1abeac5f3f7..3bcc19ceb073 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -3,20 +3,6 @@
  * This file contains the functions which manage clocksource drivers.
  *
  * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 0deb0be2c445..dc1b6f1929f9 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -1,24 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0+
-/***********************************************************************
-* This file contains the jiffies based clocksource.
-*
-* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*
-************************************************************************/
+/*
+ * This file contains the jiffies based clocksource.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ */
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 933462326489..85b98e727306 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -1,16 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
  * Based on clocksource code. See commit 74d23cc704d1
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 #include <linux/export.h>
 #include <linux/timecounter.h>
-- 
cgit 


From 3c8f2515ac0a986c8af6ba17c376f874306e71f4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:13 +0100
Subject: posix-timers/stubs: Remove license boilerplate

The SPDX identifier defines the license of the file already. No need for
the boilerplate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: https://lkml.kernel.org/r/20181031182253.215825217@linutronix.de
---
 kernel/time/posix-stubs.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index b9f9f6f02e11..a51895486e5e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -4,10 +4,6 @@
  *
  * Created by:  Nicolas Pitre, July 2016
  * Copyright:   (C) 2016 Linaro Limited
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/linkage.h>
-- 
cgit 


From 2fa6d420c22268b30dc46c8cbfe8bec0e361e03e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:14 +0100
Subject: sched/clock: Remove license boilerplate

The SPDX identifier defines the license of the file already. No need for
the boilerplate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182253.300140921@linutronix.de
---
 kernel/time/sched_clock.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 11570ba451cc..094b82ca95e5 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -2,10 +2,6 @@
 /*
  * Generic sched_clock() support, to extend low level hardware time
  * counters to full 64-bit ns values.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include <linux/clocksource.h>
 #include <linux/init.h>
-- 
cgit 


From c804efeb58229e6040b9a200cbab1fc8c150f99d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:15 +0100
Subject: posix-clocks: Remove license boiler plate

The SPDX identifier defines the license of the file already. No need for
the boilerplate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Manfred Rudigier <manfred.rudigier@omicronenergy.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182253.385909804@linutronix.de
---
 kernel/time/posix-clock.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 339e35e4605f..425bbfce6819 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -3,20 +3,6 @@
  * Support for dynamic clock devices
  *
  * Copyright (C) 2010 OMICRON electronics GmbH
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 #include <linux/device.h>
 #include <linux/export.h>
-- 
cgit 


From 0141de741e0710d5e2e68087577329606f59ed71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Oct 2018 19:21:16 +0100
Subject: posix-timers: Remove license boilerplate

The SPDX identifier defines the license of the file already. No need for
the boilerplate.

Remove also the completely outdated Montavista snail mail address.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: David Riley <davidriley@chromium.org>
Cc: Colin Cross <ccross@android.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lkml.kernel.org/r/20181031182253.479792883@linutronix.de
---
 kernel/time/posix-timers.c | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index e8cd9aa6c9cf..dd70ced15a36 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -7,25 +7,7 @@
  * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
  *			     Copyright (C) 2004 Boris Hu
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
-
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
- */
-
-/* These are all the functions necessary to implement
- * POSIX clocks & timers
+ * These are all the functions necessary to implement POSIX clocks & timers
  */
 #include <linux/mm.h>
 #include <linux/interrupt.h>
-- 
cgit 


From cf0dd411e80f7066cabf69899724e48dd3192b99 Mon Sep 17 00:00:00 2001
From: Rustam Kovhaev <rkovhaev@gmail.com>
Date: Fri, 23 Nov 2018 15:48:16 -0800
Subject: bpf, tags: Fix DEFINE_PER_CPU expansion

Building tags produces warning:

  ctags: Warning: kernel/bpf/local_storage.c:10: null expansion of name pattern "\1"

Let's use the same fix as in commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU
expansions"), even though it violates the usual code style.

Signed-off-by: Rustam Kovhaev <rkovhaev@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/local_storage.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c97a8f968638..9e94b1cc6cf2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,8 +7,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage*,
-	       bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
-- 
cgit 


From 311fe1a813324ea6d8172a3e9eefb1b274c72fea Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sun, 25 Nov 2018 23:32:51 +0000
Subject: bpf: btf: fix spelling mistake "Memmber" -> "Member"

There is a spelling mistake in a btf_verifier_log_member message,
fix it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 69da9169819a..a09b2f94ab25 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1621,7 +1621,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 
 		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
 			btf_verifier_log_member(env, t, member,
-						"Memmber bits_offset exceeds its struct size");
+						"Member bits_offset exceeds its struct size");
 			return -EINVAL;
 		}
 
-- 
cgit 


From d0a3f18a70f2d9700bf9f5e9c4a505472388a7c1 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 2 Aug 2018 17:56:50 -0400
Subject: audit: minimize our use of audit_log_format()

There are some cases where we are making multiple audit_log_format()
calls in a row, for no apparent reason.  Squash these down to a
single audit_log_format() call whenever possible.

Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c          | 11 +++++------
 kernel/audit_fsnotify.c |  3 +--
 kernel/audit_tree.c     |  3 +--
 kernel/audit_watch.c    |  3 +--
 kernel/auditsc.c        |  7 +++----
 5 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 6c53e373b828..d09298d3c2d2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2177,22 +2177,21 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
 	}
 
 	/* log the audit_names record type */
-	audit_log_format(ab, " nametype=");
 	switch(n->type) {
 	case AUDIT_TYPE_NORMAL:
-		audit_log_format(ab, "NORMAL");
+		audit_log_format(ab, " nametype=NORMAL");
 		break;
 	case AUDIT_TYPE_PARENT:
-		audit_log_format(ab, "PARENT");
+		audit_log_format(ab, " nametype=PARENT");
 		break;
 	case AUDIT_TYPE_CHILD_DELETE:
-		audit_log_format(ab, "DELETE");
+		audit_log_format(ab, " nametype=DELETE");
 		break;
 	case AUDIT_TYPE_CHILD_CREATE:
-		audit_log_format(ab, "CREATE");
+		audit_log_format(ab, " nametype=CREATE");
 		break;
 	default:
-		audit_log_format(ab, "UNKNOWN");
+		audit_log_format(ab, " nametype=UNKNOWN");
 		break;
 	}
 
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index f90ffa699e5b..cf4512a33675 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -131,8 +131,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
 	if (unlikely(!ab))
 		return;
 	audit_log_session_info(ab);
-	audit_log_format(ab, " op=%s", op);
-	audit_log_format(ab, " path=");
+	audit_log_format(ab, " op=%s path=", op);
 	audit_log_untrustedstring(ab, audit_mark->path);
 	audit_log_key(ab, rule->filterkey);
 	audit_log_format(ab, " list=%d res=1", rule->listnr);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 58e84eb5d826..d4af4d97f847 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -533,8 +533,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (unlikely(!ab))
 		return;
-	audit_log_format(ab, "op=remove_rule");
-	audit_log_format(ab, " dir=");
+	audit_log_format(ab, "op=remove_rule dir=");
 	audit_log_untrustedstring(ab, rule->tree->pathname);
 	audit_log_key(ab, rule->filterkey);
 	audit_log_format(ab, " list=%d res=1", rule->listnr);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 568e48d1d0ab..20ef9ba134b0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -246,8 +246,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 	if (!ab)
 		return;
 	audit_log_session_info(ab);
-	audit_log_format(ab, "op=%s", op);
-	audit_log_format(ab, " path=");
+	audit_log_format(ab, "op=%s path=", op);
 	audit_log_untrustedstring(ab, w->path);
 	audit_log_key(ab, r->filterkey);
 	audit_log_format(ab, " list=%d res=1", r->listnr);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 605f2d825204..51e735aedf58 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2503,10 +2503,9 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
 	if (unlikely(!ab))
 		return;
 
-	audit_log_format(ab, "op=seccomp-logging");
-	audit_log_format(ab, " actions=%s", names);
-	audit_log_format(ab, " old-actions=%s", old_names);
-	audit_log_format(ab, " res=%d", res);
+	audit_log_format(ab,
+			 "op=seccomp-logging actions=%s old-actions=%s res=%d",
+			 names, old_names, res);
 	audit_log_end(ab);
 }
 
-- 
cgit 


From 2a1fe215e7300c7ebd6a7a24afcab71db5107bb0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 26 Nov 2018 18:40:07 -0500
Subject: audit: use current whenever possible

There are many places, notably audit_log_task_info() and
audit_log_exit(), that take task_struct pointers but in reality they
are always working on the current task.  This patch eliminates the
task_struct arguments and uses current directly which allows a number
of cleanups as well.

Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 drivers/tty/tty_audit.c          |  13 ++--
 include/linux/audit.h            |   6 +-
 kernel/audit.c                   |  34 +++++-----
 kernel/audit.h                   |   2 +-
 kernel/auditsc.c                 | 131 +++++++++++++++++++--------------------
 security/integrity/ima/ima_api.c |   2 +-
 6 files changed, 90 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c
index 50f567b6a66e..28f87fd6a28e 100644
--- a/drivers/tty/tty_audit.c
+++ b/drivers/tty/tty_audit.c
@@ -61,20 +61,19 @@ static void tty_audit_log(const char *description, dev_t dev,
 			  unsigned char *data, size_t size)
 {
 	struct audit_buffer *ab;
-	struct task_struct *tsk = current;
-	pid_t pid = task_pid_nr(tsk);
-	uid_t uid = from_kuid(&init_user_ns, task_uid(tsk));
-	uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(tsk));
-	unsigned int sessionid = audit_get_sessionid(tsk);
+	pid_t pid = task_pid_nr(current);
+	uid_t uid = from_kuid(&init_user_ns, task_uid(current));
+	uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+	unsigned int sessionid = audit_get_sessionid(current);
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY);
 	if (ab) {
-		char name[sizeof(tsk->comm)];
+		char name[sizeof(current->comm)];
 
 		audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d"
 				 " minor=%d comm=", description, pid, uid,
 				 loginuid, sessionid, MAJOR(dev), MINOR(dev));
-		get_task_comm(name, tsk);
+		get_task_comm(name, current);
 		audit_log_untrustedstring(ab, name);
 		audit_log_format(ab, " data=");
 		audit_log_n_hex(ab, data, size);
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 58cf665f597e..a625c29a2ea2 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -151,8 +151,7 @@ extern void		    audit_log_link_denied(const char *operation);
 extern void		    audit_log_lost(const char *message);
 
 extern int audit_log_task_context(struct audit_buffer *ab);
-extern void audit_log_task_info(struct audit_buffer *ab,
-				struct task_struct *tsk);
+extern void audit_log_task_info(struct audit_buffer *ab);
 
 extern int		    audit_update_lsm_rules(void);
 
@@ -200,8 +199,7 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
 {
 	return 0;
 }
-static inline void audit_log_task_info(struct audit_buffer *ab,
-				       struct task_struct *tsk)
+static inline void audit_log_task_info(struct audit_buffer *ab)
 { }
 #define audit_enabled AUDIT_OFF
 #endif /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index d09298d3c2d2..779671883349 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1096,10 +1096,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
 
 	if (audit_enabled == AUDIT_OFF)
 		return;
+
 	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
 	if (!ab)
 		return;
-	audit_log_task_info(ab, current);
+	audit_log_task_info(ab);
 	audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
 			 audit_feature_names[which], !!old_feature, !!new_feature,
 			 !!old_lock, !!new_lock, res);
@@ -2246,15 +2247,15 @@ out_null:
 	audit_log_format(ab, " exe=(null)");
 }
 
-struct tty_struct *audit_get_tty(struct task_struct *tsk)
+struct tty_struct *audit_get_tty(void)
 {
 	struct tty_struct *tty = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	if (tsk->signal)
-		tty = tty_kref_get(tsk->signal->tty);
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	if (current->signal)
+		tty = tty_kref_get(current->signal->tty);
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 	return tty;
 }
 
@@ -2263,25 +2264,24 @@ void audit_put_tty(struct tty_struct *tty)
 	tty_kref_put(tty);
 }
 
-void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+void audit_log_task_info(struct audit_buffer *ab)
 {
 	const struct cred *cred;
-	char comm[sizeof(tsk->comm)];
+	char comm[sizeof(current->comm)];
 	struct tty_struct *tty;
 
 	if (!ab)
 		return;
 
-	/* tsk == current */
 	cred = current_cred();
-	tty = audit_get_tty(tsk);
+	tty = audit_get_tty();
 	audit_log_format(ab,
 			 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
 			 " euid=%u suid=%u fsuid=%u"
 			 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
-			 task_ppid_nr(tsk),
-			 task_tgid_nr(tsk),
-			 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+			 task_ppid_nr(current),
+			 task_tgid_nr(current),
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
 			 from_kuid(&init_user_ns, cred->uid),
 			 from_kgid(&init_user_ns, cred->gid),
 			 from_kuid(&init_user_ns, cred->euid),
@@ -2291,11 +2291,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 			 from_kgid(&init_user_ns, cred->sgid),
 			 from_kgid(&init_user_ns, cred->fsgid),
 			 tty ? tty_name(tty) : "(none)",
-			 audit_get_sessionid(tsk));
+			 audit_get_sessionid(current));
 	audit_put_tty(tty);
 	audit_log_format(ab, " comm=");
-	audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-	audit_log_d_path_exe(ab, tsk->mm);
+	audit_log_untrustedstring(ab, get_task_comm(comm, current));
+	audit_log_d_path_exe(ab, current->mm);
 	audit_log_task_context(ab);
 }
 EXPORT_SYMBOL(audit_log_task_info);
@@ -2316,7 +2316,7 @@ void audit_log_link_denied(const char *operation)
 	if (!ab)
 		return;
 	audit_log_format(ab, "op=%s", operation);
-	audit_log_task_info(ab, current);
+	audit_log_task_info(ab);
 	audit_log_format(ab, " res=0");
 	audit_log_end(ab);
 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 0b5295aeaebb..91421679a168 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -264,7 +264,7 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
 extern void audit_log_d_path_exe(struct audit_buffer *ab,
 				 struct mm_struct *mm);
 
-extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern struct tty_struct *audit_get_tty(void);
 extern void audit_put_tty(struct tty_struct *tty);
 
 /* audit watch functions */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 51e735aedf58..6593a5207fb0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -830,44 +830,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 	rcu_read_unlock();
 }
 
-/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
-static inline struct audit_context *audit_take_context(struct task_struct *tsk,
-						      int return_valid,
-						      long return_code)
-{
-	struct audit_context *context = tsk->audit_context;
-
-	if (!context)
-		return NULL;
-	context->return_valid = return_valid;
-
-	/*
-	 * we need to fix up the return code in the audit logs if the actual
-	 * return codes are later going to be fixed up by the arch specific
-	 * signal handlers
-	 *
-	 * This is actually a test for:
-	 * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
-	 * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
-	 *
-	 * but is faster than a bunch of ||
-	 */
-	if (unlikely(return_code <= -ERESTARTSYS) &&
-	    (return_code >= -ERESTART_RESTARTBLOCK) &&
-	    (return_code != -ENOIOCTLCMD))
-		context->return_code = -EINTR;
-	else
-		context->return_code  = return_code;
-
-	if (context->in_syscall && !context->dummy) {
-		audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
-		audit_filter_inodes(tsk, context);
-	}
-
-	audit_set_context(tsk, NULL);
-	return context;
-}
-
 static inline void audit_proctitle_free(struct audit_context *context)
 {
 	kfree(context->proctitle.value);
@@ -1296,15 +1258,18 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
 	return len;
 }
 
-static void audit_log_proctitle(struct task_struct *tsk,
-			 struct audit_context *context)
+static void audit_log_proctitle(void)
 {
 	int res;
 	char *buf;
 	char *msg = "(null)";
 	int len = strlen(msg);
+	struct audit_context *context = audit_context();
 	struct audit_buffer *ab;
 
+	if (!context || context->dummy)
+		return;
+
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
 	if (!ab)
 		return;	/* audit_panic or being filtered */
@@ -1317,7 +1282,7 @@ static void audit_log_proctitle(struct task_struct *tsk,
 		if (!buf)
 			goto out;
 		/* Historically called this from procfs naming */
-		res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+		res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
 		if (res == 0) {
 			kfree(buf);
 			goto out;
@@ -1337,15 +1302,15 @@ out:
 	audit_log_end(ab);
 }
 
-static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+static void audit_log_exit(void)
 {
 	int i, call_panic = 0;
+	struct audit_context *context = audit_context();
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
 	struct audit_names *n;
 
-	/* tsk == current */
-	context->personality = tsk->personality;
+	context->personality = current->personality;
 
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
 	if (!ab)
@@ -1367,7 +1332,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			 context->argv[3],
 			 context->name_count);
 
-	audit_log_task_info(ab, tsk);
+	audit_log_task_info(ab);
 	audit_log_key(ab, context->filterkey);
 	audit_log_end(ab);
 
@@ -1456,7 +1421,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_name(context, n, NULL, i++, &call_panic);
 	}
 
-	audit_log_proctitle(tsk, context);
+	audit_log_proctitle();
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1474,22 +1439,31 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
  */
 void __audit_free(struct task_struct *tsk)
 {
-	struct audit_context *context;
+	struct audit_context *context = tsk->audit_context;
 
-	context = audit_take_context(tsk, 0, 0);
 	if (!context)
 		return;
 
-	/* Check for system calls that do not go through the exit
-	 * function (e.g., exit_group), then free context block.
-	 * We use GFP_ATOMIC here because we might be doing this
-	 * in the context of the idle thread */
-	/* that can happen only if we are called from do_exit() */
-	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
-		audit_log_exit(context, tsk);
+	/* We are called either by do_exit() or the fork() error handling code;
+	 * in the former case tsk == current and in the latter tsk is a
+	 * random task_struct that doesn't doesn't have any meaningful data we
+	 * need to log via audit_log_exit().
+	 */
+	if (tsk == current && !context->dummy && context->in_syscall) {
+		context->return_valid = 0;
+		context->return_code = 0;
+
+		audit_filter_syscall(tsk, context,
+				     &audit_filter_list[AUDIT_FILTER_EXIT]);
+		audit_filter_inodes(tsk, context);
+		if (context->current_state == AUDIT_RECORD_CONTEXT)
+			audit_log_exit();
+	}
+
 	if (!list_empty(&context->killed_trees))
 		audit_kill_trees(&context->killed_trees);
 
+	audit_set_context(tsk, NULL);
 	audit_free_context(context);
 }
 
@@ -1559,17 +1533,40 @@ void __audit_syscall_exit(int success, long return_code)
 {
 	struct audit_context *context;
 
-	if (success)
-		success = AUDITSC_SUCCESS;
-	else
-		success = AUDITSC_FAILURE;
-
-	context = audit_take_context(current, success, return_code);
+	context = audit_context();
 	if (!context)
 		return;
 
-	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
-		audit_log_exit(context, current);
+	if (!context->dummy && context->in_syscall) {
+		if (success)
+			context->return_valid = AUDITSC_SUCCESS;
+		else
+			context->return_valid = AUDITSC_FAILURE;
+
+		/*
+		 * we need to fix up the return code in the audit logs if the
+		 * actual return codes are later going to be fixed up by the
+		 * arch specific signal handlers
+		 *
+		 * This is actually a test for:
+		 * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+		 * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+		 *
+		 * but is faster than a bunch of ||
+		 */
+		if (unlikely(return_code <= -ERESTARTSYS) &&
+		    (return_code >= -ERESTART_RESTARTBLOCK) &&
+		    (return_code != -ENOIOCTLCMD))
+			context->return_code = -EINTR;
+		else
+			context->return_code  = return_code;
+
+		audit_filter_syscall(current, context,
+				     &audit_filter_list[AUDIT_FILTER_EXIT]);
+		audit_filter_inodes(current, context);
+		if (context->current_state == AUDIT_RECORD_CONTEXT)
+			audit_log_exit();
+	}
 
 	context->in_syscall = 0;
 	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
@@ -1591,7 +1588,6 @@ void __audit_syscall_exit(int success, long return_code)
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 	}
-	audit_set_context(current, context);
 }
 
 static inline void handle_one(const struct inode *inode)
@@ -2025,7 +2021,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
-	tty = audit_get_tty(current);
+	tty = audit_get_tty();
 
 	audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
 	audit_log_task_context(ab);
@@ -2046,7 +2042,6 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
  */
 int audit_set_loginuid(kuid_t loginuid)
 {
-	struct task_struct *task = current;
 	unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
 	kuid_t oldloginuid;
 	int rc;
@@ -2065,8 +2060,8 @@ int audit_set_loginuid(kuid_t loginuid)
 			sessionid = (unsigned int)atomic_inc_return(&session_id);
 	}
 
-	task->sessionid = sessionid;
-	task->loginuid = loginuid;
+	current->sessionid = sessionid;
+	current->loginuid = loginuid;
 out:
 	audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
 	return rc;
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 99dd1d53fc35..af134588ab4e 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -336,7 +336,7 @@ void ima_audit_measurement(struct integrity_iint_cache *iint,
 	audit_log_untrustedstring(ab, filename);
 	audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash);
 
-	audit_log_task_info(ab, current);
+	audit_log_task_info(ab);
 	audit_log_end(ab);
 
 	iint->flags |= IMA_AUDITED;
-- 
cgit 


From ba64e7d8525236aa56ab58ba3a3a71615c4ee289 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 24 Nov 2018 23:20:44 -0800
Subject: bpf: btf: support proper non-jit func info

Commit 838e96904ff3 ("bpf: Introduce bpf_func_info")
added bpf func info support. The userspace is able
to get better ksym's for bpf programs with jit, and
is able to print out func prototypes.

For a program containing func-to-func calls, the existing
implementation returns user specified number of function
calls and BTF types if jit is enabled. If the jit is not
enabled, it only returns the type for the main function.

This is undesirable. Interpreter may still be used
and we should keep feature identical regardless of
whether jit is enabled or not.
This patch fixed this discrepancy.

Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info")
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  6 +++--
 include/linux/bpf_verifier.h |  1 -
 kernel/bpf/core.c            |  3 ++-
 kernel/bpf/syscall.c         | 33 +++++++-------------------
 kernel/bpf/verifier.c        | 55 ++++++++++++++++++++++++++++++--------------
 5 files changed, 52 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7f0e225bf630..e82b7039fc66 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -299,7 +299,8 @@ struct bpf_prog_aux {
 	u32 max_pkt_offset;
 	u32 stack_depth;
 	u32 id;
-	u32 func_cnt;
+	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
@@ -317,7 +318,8 @@ struct bpf_prog_aux {
 #endif
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
-	u32 type_id; /* type id for this prog/func */
+	struct bpf_func_info *func_info;
+	u32 func_info_cnt;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 204382f46fd8..11f5df1092d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -204,7 +204,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u16 stack_depth; /* max. stack depth used by this function */
-	u32 type_id; /* btf type_id for this subprog */
 };
 
 /* single container for all structs
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 16d77012ad3e..002d67c62c8b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -411,7 +411,8 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 
 	/* prog->aux->name will be ignored if full btf name is available */
 	if (prog->aux->btf) {
-		type = btf_type_by_id(prog->aux->btf, prog->aux->type_id);
+		type = btf_type_by_id(prog->aux->btf,
+				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
 		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
 		return;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 998377808102..85cbeec06e50 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1214,6 +1214,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del_all(prog);
 		btf_put(prog->aux->btf);
+		kvfree(prog->aux->func_info);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -2219,46 +2220,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	}
 
 	if (prog->aux->btf) {
+		u32 krec_size = sizeof(struct bpf_func_info);
 		u32 ucnt, urec_size;
 
 		info.btf_id = btf_id(prog->aux->btf);
 
 		ucnt = info.func_info_cnt;
-		info.func_info_cnt = prog->aux->func_cnt ? : 1;
+		info.func_info_cnt = prog->aux->func_info_cnt;
 		urec_size = info.func_info_rec_size;
-		info.func_info_rec_size = sizeof(struct bpf_func_info);
+		info.func_info_rec_size = krec_size;
 		if (ucnt) {
 			/* expect passed-in urec_size is what the kernel expects */
 			if (urec_size != info.func_info_rec_size)
 				return -EINVAL;
 
 			if (bpf_dump_raw_ok()) {
-				struct bpf_func_info kern_finfo;
 				char __user *user_finfo;
-				u32 i, insn_offset;
 
 				user_finfo = u64_to_user_ptr(info.func_info);
-				if (prog->aux->func_cnt) {
-					ucnt = min_t(u32, info.func_info_cnt, ucnt);
-					insn_offset = 0;
-					for (i = 0; i < ucnt; i++) {
-						kern_finfo.insn_offset = insn_offset;
-						kern_finfo.type_id = prog->aux->func[i]->aux->type_id;
-						if (copy_to_user(user_finfo, &kern_finfo,
-								 sizeof(kern_finfo)))
-							return -EFAULT;
-
-						/* func[i]->len holds the prog len */
-						insn_offset += prog->aux->func[i]->len;
-						user_finfo += urec_size;
-					}
-				} else {
-					kern_finfo.insn_offset = 0;
-					kern_finfo.type_id = prog->aux->type_id;
-					if (copy_to_user(user_finfo, &kern_finfo,
-							 sizeof(kern_finfo)))
-						return -EFAULT;
-				}
+				ucnt = min_t(u32, info.func_info_cnt, ucnt);
+				if (copy_to_user(user_finfo, prog->aux->func_info,
+						 krec_size * ucnt))
+					return -EFAULT;
 			} else {
 				info.func_info_cnt = 0;
 			}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f102c4fd0c5a..05d95c0e4a26 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4650,7 +4650,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 {
 	u32 i, nfuncs, urec_size, min_size, prev_offset;
 	u32 krec_size = sizeof(struct bpf_func_info);
-	struct bpf_func_info krecord = {};
+	struct bpf_func_info *krecord = NULL;
 	const struct btf_type *type;
 	void __user *urecord;
 	struct btf *btf;
@@ -4682,6 +4682,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 	urecord = u64_to_user_ptr(attr->func_info);
 	min_size = min_t(u32, krec_size, urec_size);
 
+	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!krecord) {
+		ret = -ENOMEM;
+		goto free_btf;
+	}
+
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
 		if (ret) {
@@ -4696,59 +4702,69 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}
 
-		if (copy_from_user(&krecord, urecord, min_size)) {
+		if (copy_from_user(&krecord[i], urecord, min_size)) {
 			ret = -EFAULT;
 			goto free_btf;
 		}
 
 		/* check insn_offset */
 		if (i == 0) {
-			if (krecord.insn_offset) {
+			if (krecord[i].insn_offset) {
 				verbose(env,
 					"nonzero insn_offset %u for the first func info record",
-					krecord.insn_offset);
+					krecord[i].insn_offset);
 				ret = -EINVAL;
 				goto free_btf;
 			}
-		} else if (krecord.insn_offset <= prev_offset) {
+		} else if (krecord[i].insn_offset <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
-				krecord.insn_offset, prev_offset);
+				krecord[i].insn_offset, prev_offset);
 			ret = -EINVAL;
 			goto free_btf;
 		}
 
-		if (env->subprog_info[i].start != krecord.insn_offset) {
+		if (env->subprog_info[i].start != krecord[i].insn_offset) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
 			goto free_btf;
 		}
 
 		/* check type_id */
-		type = btf_type_by_id(btf, krecord.type_id);
+		type = btf_type_by_id(btf, krecord[i].type_id);
 		if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
 			verbose(env, "invalid type id %d in func info",
-				krecord.type_id);
+				krecord[i].type_id);
 			ret = -EINVAL;
 			goto free_btf;
 		}
 
-		if (i == 0)
-			prog->aux->type_id = krecord.type_id;
-		env->subprog_info[i].type_id = krecord.type_id;
-
-		prev_offset = krecord.insn_offset;
+		prev_offset = krecord[i].insn_offset;
 		urecord += urec_size;
 	}
 
 	prog->aux->btf = btf;
+	prog->aux->func_info = krecord;
+	prog->aux->func_info_cnt = nfuncs;
 	return 0;
 
 free_btf:
 	btf_put(btf);
+	kvfree(krecord);
 	return ret;
 }
 
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+	int i;
+
+	if (!env->prog->aux->func_info)
+		return;
+
+	for (i = 0; i < env->subprog_cnt; i++)
+		env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start;
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@@ -6043,15 +6059,17 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (bpf_prog_calc_tag(func[i]))
 			goto out_free;
 		func[i]->is_func = 1;
+		func[i]->aux->func_idx = i;
+		/* the btf and func_info will be freed only at prog->aux */
+		func[i]->aux->btf = prog->aux->btf;
+		func[i]->aux->func_info = prog->aux->func_info;
+
 		/* Use bpf_prog_F_tag to indicate functions in stack traces.
 		 * Long term would need debug info to populate names
 		 */
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
-		/* the btf will be freed only at prog->aux */
-		func[i]->aux->btf = prog->aux->btf;
-		func[i]->aux->type_id = env->subprog_info[i].type_id;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -6572,6 +6590,9 @@ skip_full_check:
 		convert_pseudo_ld_imm64(env);
 	}
 
+	if (ret == 0)
+		adjust_btf_func(env);
+
 err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
-- 
cgit 


From 7440172974e85b1828bdd84ac6b23b5bcad9c5eb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 18:44:52 -0800
Subject: tracing: Replace synchronize_sched() and call_rcu_sched()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can
be replaced by synchronize_rcu().  Similarly, call_rcu_sched() can be
replaced by call_rcu().  This commit therefore makes these changes.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <linux-kernel@vger.kernel.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h         |  2 +-
 kernel/trace/ftrace.c              | 24 ++++++++++++------------
 kernel/trace/ring_buffer.c         | 12 ++++++------
 kernel/trace/trace.c               | 10 +++++-----
 kernel/trace/trace_events_filter.c |  4 ++--
 kernel/trace/trace_kprobe.c        |  2 +-
 kernel/tracepoint.c                |  4 ++--
 7 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 538ba1a58f5b..432080b59c26 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -82,7 +82,7 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
 static inline void tracepoint_synchronize_unregister(void)
 {
 	synchronize_srcu(&tracepoint_srcu);
-	synchronize_sched();
+	synchronize_rcu();
 }
 #else
 static inline void tracepoint_synchronize_unregister(void)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f536f601bd46..5b4f73e4fd56 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -173,7 +173,7 @@ static void ftrace_sync(struct work_struct *work)
 {
 	/*
 	 * This function is just a stub to implement a hard force
-	 * of synchronize_sched(). This requires synchronizing
+	 * of synchronize_rcu(). This requires synchronizing
 	 * tasks even in userspace and idle.
 	 *
 	 * Yes, function tracing is rude.
@@ -934,7 +934,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			ftrace_profile_enabled = 0;
 			/*
 			 * unregister_ftrace_profiler calls stop_machine
-			 * so this acts like an synchronize_sched.
+			 * so this acts like an synchronize_rcu.
 			 */
 			unregister_ftrace_profiler();
 		}
@@ -1086,7 +1086,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
 
 	/*
 	 * Some of the ops may be dynamically allocated,
-	 * they are freed after a synchronize_sched().
+	 * they are freed after a synchronize_rcu().
 	 */
 	preempt_disable_notrace();
 
@@ -1286,7 +1286,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
 {
 	if (!hash || hash == EMPTY_HASH)
 		return;
-	call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
+	call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
 }
 
 void ftrace_free_filter(struct ftrace_ops *ops)
@@ -1501,7 +1501,7 @@ static bool hash_contains_ip(unsigned long ip,
  * the ip is not in the ops->notrace_hash.
  *
  * This needs to be called with preemption disabled as
- * the hashes are freed with call_rcu_sched().
+ * the hashes are freed with call_rcu().
  */
 static int
 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
@@ -4496,7 +4496,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 	if (ftrace_enabled && !ftrace_hash_empty(hash))
 		ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS,
 				       &old_hash_ops);
-	synchronize_sched();
+	synchronize_rcu();
 
 	hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) {
 		hlist_del(&entry->hlist);
@@ -5314,7 +5314,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
 		mutex_unlock(&graph_lock);
 
 		/* Wait till all users are no longer using the old hash */
-		synchronize_sched();
+		synchronize_rcu();
 
 		free_ftrace_hash(old_hash);
 	}
@@ -5707,7 +5707,7 @@ void ftrace_release_mod(struct module *mod)
 	list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
 		if (mod_map->mod == mod) {
 			list_del_rcu(&mod_map->list);
-			call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+			call_rcu(&mod_map->rcu, ftrace_free_mod_map);
 			break;
 		}
 	}
@@ -5927,7 +5927,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 	struct ftrace_mod_map *mod_map;
 	const char *ret = NULL;
 
-	/* mod_map is freed via call_rcu_sched() */
+	/* mod_map is freed via call_rcu() */
 	preempt_disable();
 	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
 		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
@@ -6262,7 +6262,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 
 	/*
 	 * Some of the ops may be dynamically allocated,
-	 * they must be freed after a synchronize_sched().
+	 * they must be freed after a synchronize_rcu().
 	 */
 	preempt_disable_notrace();
 
@@ -6433,7 +6433,7 @@ static void clear_ftrace_pids(struct trace_array *tr)
 	rcu_assign_pointer(tr->function_pids, NULL);
 
 	/* Wait till all users are no longer using pid filtering */
-	synchronize_sched();
+	synchronize_rcu();
 
 	trace_free_pid_list(pid_list);
 }
@@ -6580,7 +6580,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	rcu_assign_pointer(tr->function_pids, pid_list);
 
 	if (filtered_pids) {
-		synchronize_sched();
+		synchronize_rcu();
 		trace_free_pid_list(filtered_pids);
 	} else if (pid_list) {
 		/* Register a probe to set whether to ignore the tracing of a task */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 65bd4616220d..4f3247a53259 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1834,7 +1834,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 		 * There could have been a race between checking
 		 * record_disable and incrementing it.
 		 */
-		synchronize_sched();
+		synchronize_rcu();
 		for_each_buffer_cpu(buffer, cpu) {
 			cpu_buffer = buffer->buffers[cpu];
 			rb_check_pages(cpu_buffer);
@@ -3151,7 +3151,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
  * This prevents all writes to the buffer. Any attempt to write
  * to the buffer after this will fail and return NULL.
  *
- * The caller should call synchronize_sched() after this.
+ * The caller should call synchronize_rcu() after this.
  */
 void ring_buffer_record_disable(struct ring_buffer *buffer)
 {
@@ -3253,7 +3253,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
  * This prevents all writes to the buffer. Any attempt to write
  * to the buffer after this will fail and return NULL.
  *
- * The caller should call synchronize_sched() after this.
+ * The caller should call synchronize_rcu() after this.
  */
 void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
@@ -4191,7 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
 void
 ring_buffer_read_prepare_sync(void)
 {
-	synchronize_sched();
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
 
@@ -4363,7 +4363,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	atomic_inc(&cpu_buffer->record_disabled);
 
 	/* Make sure all commits have finished */
-	synchronize_sched();
+	synchronize_rcu();
 
 	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
@@ -4496,7 +4496,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 		goto out;
 
 	/*
-	 * We can't do a synchronize_sched here because this
+	 * We can't do a synchronize_rcu here because this
 	 * function can be called in atomic context.
 	 * Normally this will be called from the same CPU as cpu.
 	 * If not it's up to the caller to protect this.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff1c4b20cd0a..51612b4a603f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu)
 	ring_buffer_record_disable(buffer);
 
 	/* Make sure all commits have finished */
-	synchronize_sched();
+	synchronize_rcu();
 	ring_buffer_reset_cpu(buffer, cpu);
 
 	ring_buffer_record_enable(buffer);
@@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
 	ring_buffer_record_disable(buffer);
 
 	/* Make sure all commits have finished */
-	synchronize_sched();
+	synchronize_rcu();
 
 	buf->time_start = buffer_ftrace_now(buf, buf->cpu);
 
@@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void)
 	preempt_enable();
 
 	/* Wait for all current users to finish */
-	synchronize_sched();
+	synchronize_rcu();
 
 	for_each_tracing_cpu(cpu) {
 		free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
@@ -5398,7 +5398,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
 	if (tr->current_trace->reset)
 		tr->current_trace->reset(tr);
 
-	/* Current trace needs to be nop_trace before synchronize_sched */
+	/* Current trace needs to be nop_trace before synchronize_rcu */
 	tr->current_trace = &nop_trace;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -5412,7 +5412,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
 		 * The update_max_tr is called from interrupts disabled
 		 * so a synchronized_sched() is sufficient.
 		 */
-		synchronize_sched();
+		synchronize_rcu();
 		free_snapshot(tr);
 	}
 #endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 84a65173b1e9..35f3aa55be85 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1614,7 +1614,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
 
 	/*
 	 * The calls can still be using the old filters.
-	 * Do a synchronize_sched() and to ensure all calls are
+	 * Do a synchronize_rcu() and to ensure all calls are
 	 * done with them before we free them.
 	 */
 	tracepoint_synchronize_unregister();
@@ -1845,7 +1845,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 	if (filter) {
 		/*
 		 * No event actually uses the system filter
-		 * we can free it without synchronize_sched().
+		 * we can free it without synchronize_rcu().
 		 */
 		__free_filter(system->filter);
 		system->filter = filter;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index fec67188c4d2..adc153ab51c0 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -333,7 +333,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 		 * event_call related objects, which will be accessed in
 		 * the kprobe_trace_func/kretprobe_trace_func.
 		 */
-		synchronize_sched();
+		synchronize_rcu();
 		kfree(link);	/* Ignored if link == NULL */
 	}
 
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index a3be42304485..46f2ab1e08a9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -92,7 +92,7 @@ static __init int release_early_probes(void)
 	while (early_probes) {
 		tmp = early_probes;
 		early_probes = tmp->next;
-		call_rcu_sched(tmp, rcu_free_old_probes);
+		call_rcu(tmp, rcu_free_old_probes);
 	}
 
 	return 0;
@@ -123,7 +123,7 @@ static inline void release_probes(struct tracepoint_func *old)
 		 * cover both cases. So let us chain the SRCU and sched RCU
 		 * callbacks to wait for both grace periods.
 		 */
-		call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
+		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
 	}
 }
 
-- 
cgit 


From ae8b7ce7647bc5c56a9a9fc32b03e1cd0ae49629 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:04:39 -0800
Subject: kprobes: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 kernel/kprobes.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..08e31d863191 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -229,7 +229,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
 	struct kprobe_insn_page *kip, *next;
 
 	/* Ensure no-one is interrupted on the garbages */
-	synchronize_sched();
+	synchronize_rcu();
 
 	list_for_each_entry_safe(kip, next, &c->pages, list) {
 		int i;
@@ -1382,7 +1382,7 @@ out:
 			if (ret) {
 				ap->flags |= KPROBE_FLAG_DISABLED;
 				list_del_rcu(&p->list);
-				synchronize_sched();
+				synchronize_rcu();
 			}
 		}
 	}
@@ -1597,7 +1597,7 @@ int register_kprobe(struct kprobe *p)
 		ret = arm_kprobe(p);
 		if (ret) {
 			hlist_del_rcu(&p->hlist);
-			synchronize_sched();
+			synchronize_rcu();
 			goto out;
 		}
 	}
@@ -1776,7 +1776,7 @@ void unregister_kprobes(struct kprobe **kps, int num)
 			kps[i]->addr = NULL;
 	mutex_unlock(&kprobe_mutex);
 
-	synchronize_sched();
+	synchronize_rcu();
 	for (i = 0; i < num; i++)
 		if (kps[i]->addr)
 			__unregister_kprobe_bottom(kps[i]);
@@ -1966,7 +1966,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
 			rps[i]->kp.addr = NULL;
 	mutex_unlock(&kprobe_mutex);
 
-	synchronize_sched();
+	synchronize_rcu();
 	for (i = 0; i < num; i++) {
 		if (rps[i]->kp.addr) {
 			__unregister_kprobe_bottom(&rps[i]->kp);
-- 
cgit 


From 51959d85f32dde9041655936eef206cc3323dc12 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:06:51 -0800
Subject: lockdep: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 kernel/locking/lockdep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1efada2dd9dd..ef27f98714c0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4195,7 +4195,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	 *
 	 * sync_sched() is sufficient because the read-side is IRQ disable.
 	 */
-	synchronize_sched();
+	synchronize_rcu();
 
 	/*
 	 * XXX at this point we could return the resources to the pool;
-- 
cgit 


From c9a863bbb1620dca3af57934cdbb002e852449fc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:09:14 -0800
Subject: sched/membarrier: synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/membarrier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 388a7a6c1aa2..3cd8a3a795d2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -210,7 +210,7 @@ static int membarrier_register_global_expedited(void)
 		 * future scheduler executions will observe the new
 		 * thread flag state for this mm.
 		 */
-		synchronize_sched();
+		synchronize_rcu();
 	}
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
@@ -246,7 +246,7 @@ static int membarrier_register_private_expedited(int flags)
 		 * Ensure all future scheduler executions will observe the
 		 * new thread flag state for this process.
 		 */
-		synchronize_sched();
+		synchronize_rcu();
 	}
 	atomic_or(state, &mm->membarrier_state);
 
-- 
cgit 


From cb2f55369d3a9e6cf5c34d2da39eb242279a582d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:17:01 -0800
Subject: modules: Replace synchronize_sched() and call_rcu_sched()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can
be replaced by synchronize_rcu().  Similarly, call_rcu_sched() can be
replaced by call_rcu().  This commit therefore makes these changes.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Acked-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 49a405891587..99b46c32d579 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2159,7 +2159,7 @@ static void free_module(struct module *mod)
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
 	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
-	synchronize_sched();
+	synchronize_rcu();
 	mutex_unlock(&module_mutex);
 
 	/* This may be empty, but that's OK */
@@ -3507,15 +3507,15 @@ static noinline int do_init_module(struct module *mod)
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
-	 * call synchronize_sched(), but we don't want to slow down the success
+	 * call synchronize_rcu(), but we don't want to slow down the success
 	 * path, so use actual RCU here.
 	 * Note that module_alloc() on most architectures creates W+X page
 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
 	 * code such as mark_rodata_ro() which depends on those mappings to
 	 * be cleaned up needs to sync with the queued work - ie
-	 * rcu_barrier_sched()
+	 * rcu_barrier()
 	 */
-	call_rcu_sched(&freeinit->rcu, do_free_init);
+	call_rcu(&freeinit->rcu, do_free_init);
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
@@ -3526,7 +3526,7 @@ fail_free_freeinit:
 fail:
 	/* Try to protect us from buggy refcounters. */
 	mod->state = MODULE_STATE_GOING;
-	synchronize_sched();
+	synchronize_rcu();
 	module_put(mod);
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
@@ -3819,7 +3819,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
  ddebug_cleanup:
 	ftrace_release_mod(mod);
 	dynamic_debug_remove(mod, info->debug);
-	synchronize_sched();
+	synchronize_rcu();
 	kfree(mod->args);
  free_arch_cleanup:
 	module_arch_cleanup(mod);
@@ -3834,7 +3834,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
 	/* Wait for RCU-sched synchronizing before releasing mod->list. */
-	synchronize_sched();
+	synchronize_rcu();
 	mutex_unlock(&module_mutex);
  free_module:
 	/* Free lock-classes; relies on the preceding sync_rcu() */
-- 
cgit 


From 25b0077511fe7cf1b876174f8481fb1742f4fb4d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:18:45 -0800
Subject: workqueue: Replace call_rcu_sched() with call_rcu()

Now that call_rcu()'s callback is not invoked until after all
preempt-disable regions of code have completed (in addition to explicitly
marked RCU read-side critical sections), call_rcu() can be used in place
of call_rcu_sched().  This commit therefore makes that change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0280deac392e..392be4b252f6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3396,7 +3396,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 	del_timer_sync(&pool->mayday_timer);
 
 	/* sched-RCU protected to allow dereferences from get_work_pool() */
-	call_rcu_sched(&pool->rcu, rcu_free_pool);
+	call_rcu(&pool->rcu, rcu_free_pool);
 }
 
 /**
@@ -3503,14 +3503,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
 	put_unbound_pool(pool);
 	mutex_unlock(&wq_pool_mutex);
 
-	call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+	call_rcu(&pwq->rcu, rcu_free_pwq);
 
 	/*
 	 * If we're the last pwq going away, @wq is already dead and no one
 	 * is gonna access it anymore.  Schedule RCU free.
 	 */
 	if (is_last)
-		call_rcu_sched(&wq->rcu, rcu_free_wq);
+		call_rcu(&wq->rcu, rcu_free_wq);
 }
 
 /**
@@ -4195,7 +4195,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		 * The base ref is never dropped on per-cpu pwqs.  Directly
 		 * schedule RCU free.
 		 */
-		call_rcu_sched(&wq->rcu, rcu_free_wq);
+		call_rcu(&wq->rcu, rcu_free_wq);
 	} else {
 		/*
 		 * We're the sole accessor of @wq at this point.  Directly
-- 
cgit 


From 0809d95451f7d867d37cf2b526b8da923fd72891 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 6 Nov 2018 19:20:05 -0800
Subject: events: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84530ab358c3..c4b90cf7734a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9918,7 +9918,7 @@ static void account_event(struct perf_event *event)
 			 * call the perf scheduling hooks before proceeding to
 			 * install events that need them.
 			 */
-			synchronize_sched();
+			synchronize_rcu();
 		}
 		/*
 		 * Now that we have waited for the sync_sched(), allow further
-- 
cgit 


From eb4c2382272ae7ae5d81fdfa5b7a6c86146eaaa4 Mon Sep 17 00:00:00 2001
From: Dennis Krein <Dennis.Krein@netapp.com>
Date: Fri, 26 Oct 2018 07:38:24 -0700
Subject: srcu: Lock srcu_data structure in srcu_gp_start()

The srcu_gp_start() function is called with the srcu_struct structure's
->lock held, but not with the srcu_data structure's ->lock.  This is
problematic because this function accesses and updates the srcu_data
structure's ->srcu_cblist, which is protected by that lock.  Failing to
hold this lock can result in corruption of the SRCU callback lists,
which in turn can result in arbitrarily bad results.

This commit therefore makes srcu_gp_start() acquire the srcu_data
structure's ->lock across the calls to rcu_segcblist_advance() and
rcu_segcblist_accelerate(), thus preventing this corruption.

Reported-by: Bart Van Assche <bvanassche@acm.org>
Reported-by: Christoph Hellwig <hch@infradead.org>
Reported-by: Sebastian Kuzminsky <seb.kuzminsky@gmail.com>
Signed-off-by: Dennis Krein <Dennis.Krein@netapp.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Tested-by: Dennis Krein <Dennis.Krein@netapp.com>
Cc: <stable@vger.kernel.org> # 4.16.x
---
 kernel/rcu/srcutree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 60f3236beaf7..697a2d7e8e8a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -451,10 +451,12 @@ static void srcu_gp_start(struct srcu_struct *sp)
 
 	lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
 	WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&sp->srcu_gp_seq));
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
 				       rcu_seq_snap(&sp->srcu_gp_seq));
+	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
 	rcu_seq_start(&sp->srcu_gp_seq);
 	state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
-- 
cgit 


From aacb5d91ab1bfbb0e8123da59a2e333d52ba7f60 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Sun, 28 Oct 2018 10:32:51 -0700
Subject: srcu: Use "ssp" instead of "sp" for srcu_struct pointer

In RCU, the distinction between "rsp", "rnp", and "rdp" has served well
for a great many years, but in SRCU, "sp" vs. "sdp" has proven confusing.
This commit therefore renames SRCU's "sp" pointers to "ssp", so that there
is "ssp" for srcu_struct pointer, "snp" for srcu_node pointer, and "sdp"
for srcu_data pointer.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 include/linux/srcu.h     |  78 ++++----
 include/linux/srcutiny.h |  24 +--
 include/linux/srcutree.h |   8 +-
 kernel/rcu/srcutiny.c    | 120 ++++++------
 kernel/rcu/srcutree.c    | 488 +++++++++++++++++++++++------------------------
 5 files changed, 359 insertions(+), 359 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index ebd5f1511690..c614375cd264 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -38,20 +38,20 @@ struct srcu_struct;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
 		       struct lock_class_key *key);
 
-#define init_srcu_struct(sp) \
+#define init_srcu_struct(ssp) \
 ({ \
 	static struct lock_class_key __srcu_key; \
 	\
-	__init_srcu_struct((sp), #sp, &__srcu_key); \
+	__init_srcu_struct((ssp), #ssp, &__srcu_key); \
 })
 
 #define __SRCU_DEP_MAP_INIT(srcu_name)	.dep_map = { .name = #srcu_name },
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-int init_srcu_struct(struct srcu_struct *sp);
+int init_srcu_struct(struct srcu_struct *ssp);
 
 #define __SRCU_DEP_MAP_INIT(srcu_name)
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -67,28 +67,28 @@ int init_srcu_struct(struct srcu_struct *sp);
 struct srcu_struct { };
 #endif
 
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
 		void (*func)(struct rcu_head *head));
-void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
-int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
-void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
-void synchronize_srcu(struct srcu_struct *sp);
+void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced);
+int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
+void synchronize_srcu(struct srcu_struct *ssp);
 
 /**
  * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
+ * @ssp: structure to clean up.
  *
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.
  */
-static inline void cleanup_srcu_struct(struct srcu_struct *sp)
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp)
 {
-	_cleanup_srcu_struct(sp, false);
+	_cleanup_srcu_struct(ssp, false);
 }
 
 /**
  * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
- * @sp: structure to clean up.
+ * @ssp: structure to clean up.
  *
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.  Also,
@@ -103,16 +103,16 @@ static inline void cleanup_srcu_struct(struct srcu_struct *sp)
  * (with high probability, anyway), and will also cause the srcu_struct
  * to be leaked.
  */
-static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *ssp)
 {
-	_cleanup_srcu_struct(sp, true);
+	_cleanup_srcu_struct(ssp, true);
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 /**
  * srcu_read_lock_held - might we be in SRCU read-side critical section?
- * @sp: The srcu_struct structure to check
+ * @ssp: The srcu_struct structure to check
  *
  * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
  * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
@@ -126,16 +126,16 @@ static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
  * relies on normal RCU, it can be called from the CPU which
  * is in the idle loop from an RCU point of view or offline.
  */
-static inline int srcu_read_lock_held(const struct srcu_struct *sp)
+static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 {
 	if (!debug_lockdep_rcu_enabled())
 		return 1;
-	return lock_is_held(&sp->dep_map);
+	return lock_is_held(&ssp->dep_map);
 }
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-static inline int srcu_read_lock_held(const struct srcu_struct *sp)
+static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
 {
 	return 1;
 }
@@ -145,7 +145,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp)
 /**
  * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
  * @p: the pointer to fetch and protect for later dereferencing
- * @sp: pointer to the srcu_struct, which is used to check that we
+ * @ssp: pointer to the srcu_struct, which is used to check that we
  *	really are in an SRCU read-side critical section.
  * @c: condition to check for update-side use
  *
@@ -154,32 +154,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp)
  * to 1.  The @c argument will normally be a logical expression containing
  * lockdep_is_held() calls.
  */
-#define srcu_dereference_check(p, sp, c) \
-	__rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu)
+#define srcu_dereference_check(p, ssp, c) \
+	__rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu)
 
 /**
  * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
  * @p: the pointer to fetch and protect for later dereferencing
- * @sp: pointer to the srcu_struct, which is used to check that we
+ * @ssp: pointer to the srcu_struct, which is used to check that we
  *	really are in an SRCU read-side critical section.
  *
  * Makes rcu_dereference_check() do the dirty work.  If PROVE_RCU
  * is enabled, invoking this outside of an RCU read-side critical
  * section will result in an RCU-lockdep splat.
  */
-#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
+#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)
 
 /**
  * srcu_dereference_notrace - no tracing and no lockdep calls from here
  * @p: the pointer to fetch and protect for later dereferencing
- * @sp: pointer to the srcu_struct, which is used to check that we
+ * @ssp: pointer to the srcu_struct, which is used to check that we
  *	really are in an SRCU read-side critical section.
  */
-#define srcu_dereference_notrace(p, sp) srcu_dereference_check((p), (sp), 1)
+#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)
 
 /**
  * srcu_read_lock - register a new reader for an SRCU-protected structure.
- * @sp: srcu_struct in which to register the new reader.
+ * @ssp: srcu_struct in which to register the new reader.
  *
  * Enter an SRCU read-side critical section.  Note that SRCU read-side
  * critical sections may be nested.  However, it is illegal to
@@ -194,44 +194,44 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp)
  * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
  * was invoked in process context.
  */
-static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
+static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	retval = __srcu_read_lock(sp);
-	rcu_lock_acquire(&(sp)->dep_map);
+	retval = __srcu_read_lock(ssp);
+	rcu_lock_acquire(&(ssp)->dep_map);
 	return retval;
 }
 
 /* Used by tracing, cannot be traced and cannot invoke lockdep. */
 static inline notrace int
-srcu_read_lock_notrace(struct srcu_struct *sp) __acquires(sp)
+srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
 {
 	int retval;
 
-	retval = __srcu_read_lock(sp);
+	retval = __srcu_read_lock(ssp);
 	return retval;
 }
 
 /**
  * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
- * @sp: srcu_struct in which to unregister the old reader.
+ * @ssp: srcu_struct in which to unregister the old reader.
  * @idx: return value from corresponding srcu_read_lock().
  *
  * Exit an SRCU read-side critical section.
  */
-static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
-	__releases(sp)
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
+	__releases(ssp)
 {
-	rcu_lock_release(&(sp)->dep_map);
-	__srcu_read_unlock(sp, idx);
+	rcu_lock_release(&(ssp)->dep_map);
+	__srcu_read_unlock(ssp, idx);
 }
 
 /* Used by tracing, cannot be traced and cannot call lockdep. */
 static inline notrace void
-srcu_read_unlock_notrace(struct srcu_struct *sp, int idx) __releases(sp)
+srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
 {
-	__srcu_read_unlock(sp, idx);
+	__srcu_read_unlock(ssp, idx);
 }
 
 /**
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index f41d2fb09f87..b19216aaaef2 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -60,7 +60,7 @@ void srcu_drive_gp(struct work_struct *wp);
 #define DEFINE_STATIC_SRCU(name) \
 	static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
 
-void synchronize_srcu(struct srcu_struct *sp);
+void synchronize_srcu(struct srcu_struct *ssp);
 
 /*
  * Counts the new reader in the appropriate per-CPU element of the
@@ -68,36 +68,36 @@ void synchronize_srcu(struct srcu_struct *sp);
  * __srcu_read_unlock() must be in the same handler instance.  Returns an
  * index that must be passed to the matching srcu_read_unlock().
  */
-static inline int __srcu_read_lock(struct srcu_struct *sp)
+static inline int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
-	idx = READ_ONCE(sp->srcu_idx);
-	WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
+	idx = READ_ONCE(ssp->srcu_idx);
+	WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1);
 	return idx;
 }
 
-static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
+static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
 {
-	synchronize_srcu(sp);
+	synchronize_srcu(ssp);
 }
 
-static inline void srcu_barrier(struct srcu_struct *sp)
+static inline void srcu_barrier(struct srcu_struct *ssp)
 {
-	synchronize_srcu(sp);
+	synchronize_srcu(ssp);
 }
 
 /* Defined here to avoid size increase for non-torture kernels. */
-static inline void srcu_torture_stats_print(struct srcu_struct *sp,
+static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
 					    char *tt, char *tf)
 {
 	int idx;
 
-	idx = READ_ONCE(sp->srcu_idx) & 0x1;
+	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
 	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
 		 tt, tf, idx,
-		 READ_ONCE(sp->srcu_lock_nesting[!idx]),
-		 READ_ONCE(sp->srcu_lock_nesting[idx]));
+		 READ_ONCE(ssp->srcu_lock_nesting[!idx]),
+		 READ_ONCE(ssp->srcu_lock_nesting[idx]));
 }
 
 #endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 0ae91b3a7406..6f292bd3e7db 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -51,7 +51,7 @@ struct srcu_data {
 	unsigned long grpmask;			/* Mask for leaf srcu_node */
 						/*  ->srcu_data_have_cbs[]. */
 	int cpu;
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 };
 
 /*
@@ -138,8 +138,8 @@ struct srcu_struct {
 #define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
 #define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
 
-void synchronize_srcu_expedited(struct srcu_struct *sp);
-void srcu_barrier(struct srcu_struct *sp);
-void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf);
+void synchronize_srcu_expedited(struct srcu_struct *ssp);
+void srcu_barrier(struct srcu_struct *ssp);
+void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);
 
 #endif
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b46e6683f8c9..32dfd6522548 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -37,30 +37,30 @@ int rcu_scheduler_active __read_mostly;
 static LIST_HEAD(srcu_boot_list);
 static bool srcu_init_done;
 
-static int init_srcu_struct_fields(struct srcu_struct *sp)
+static int init_srcu_struct_fields(struct srcu_struct *ssp)
 {
-	sp->srcu_lock_nesting[0] = 0;
-	sp->srcu_lock_nesting[1] = 0;
-	init_swait_queue_head(&sp->srcu_wq);
-	sp->srcu_cb_head = NULL;
-	sp->srcu_cb_tail = &sp->srcu_cb_head;
-	sp->srcu_gp_running = false;
-	sp->srcu_gp_waiting = false;
-	sp->srcu_idx = 0;
-	INIT_WORK(&sp->srcu_work, srcu_drive_gp);
-	INIT_LIST_HEAD(&sp->srcu_work.entry);
+	ssp->srcu_lock_nesting[0] = 0;
+	ssp->srcu_lock_nesting[1] = 0;
+	init_swait_queue_head(&ssp->srcu_wq);
+	ssp->srcu_cb_head = NULL;
+	ssp->srcu_cb_tail = &ssp->srcu_cb_head;
+	ssp->srcu_gp_running = false;
+	ssp->srcu_gp_waiting = false;
+	ssp->srcu_idx = 0;
+	INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
+	INIT_LIST_HEAD(&ssp->srcu_work.entry);
 	return 0;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
 		       struct lock_class_key *key)
 {
 	/* Don't re-initialize a lock while it is held. */
-	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
-	lockdep_init_map(&sp->dep_map, name, key, 0);
-	return init_srcu_struct_fields(sp);
+	debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+	lockdep_init_map(&ssp->dep_map, name, key, 0);
+	return init_srcu_struct_fields(ssp);
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
 
@@ -68,15 +68,15 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
 
 /*
  * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
+ * @ssp: structure to initialize.
  *
  * Must invoke this on a given srcu_struct before passing that srcu_struct
  * to any other function.  Each srcu_struct represents a separate domain
  * of SRCU protection.
  */
-int init_srcu_struct(struct srcu_struct *sp)
+int init_srcu_struct(struct srcu_struct *ssp)
 {
-	return init_srcu_struct_fields(sp);
+	return init_srcu_struct_fields(ssp);
 }
 EXPORT_SYMBOL_GPL(init_srcu_struct);
 
@@ -84,22 +84,22 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 
 /*
  * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
+ * @ssp: structure to clean up.
  *
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.
  */
-void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
+void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
 {
-	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
+	WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
 	if (quiesced)
-		WARN_ON(work_pending(&sp->srcu_work));
+		WARN_ON(work_pending(&ssp->srcu_work));
 	else
-		flush_work(&sp->srcu_work);
-	WARN_ON(sp->srcu_gp_running);
-	WARN_ON(sp->srcu_gp_waiting);
-	WARN_ON(sp->srcu_cb_head);
-	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
+		flush_work(&ssp->srcu_work);
+	WARN_ON(ssp->srcu_gp_running);
+	WARN_ON(ssp->srcu_gp_waiting);
+	WARN_ON(ssp->srcu_cb_head);
+	WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
 }
 EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
@@ -107,13 +107,13 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
  * Removes the count for the old reader from the appropriate element of
  * the srcu_struct.
  */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
-	int newval = sp->srcu_lock_nesting[idx] - 1;
+	int newval = ssp->srcu_lock_nesting[idx] - 1;
 
-	WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
-	if (!newval && READ_ONCE(sp->srcu_gp_waiting))
-		swake_up_one(&sp->srcu_wq);
+	WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+	if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+		swake_up_one(&ssp->srcu_wq);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -127,24 +127,24 @@ void srcu_drive_gp(struct work_struct *wp)
 	int idx;
 	struct rcu_head *lh;
 	struct rcu_head *rhp;
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
-	sp = container_of(wp, struct srcu_struct, srcu_work);
-	if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
+	ssp = container_of(wp, struct srcu_struct, srcu_work);
+	if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
 		return; /* Already running or nothing to do. */
 
 	/* Remove recently arrived callbacks and wait for readers. */
-	WRITE_ONCE(sp->srcu_gp_running, true);
+	WRITE_ONCE(ssp->srcu_gp_running, true);
 	local_irq_disable();
-	lh = sp->srcu_cb_head;
-	sp->srcu_cb_head = NULL;
-	sp->srcu_cb_tail = &sp->srcu_cb_head;
+	lh = ssp->srcu_cb_head;
+	ssp->srcu_cb_head = NULL;
+	ssp->srcu_cb_tail = &ssp->srcu_cb_head;
 	local_irq_enable();
-	idx = sp->srcu_idx;
-	WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
-	WRITE_ONCE(sp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
-	swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
-	WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+	idx = ssp->srcu_idx;
+	WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+	WRITE_ONCE(ssp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
+	swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+	WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
 
 	/* Invoke the callbacks we removed above. */
 	while (lh) {
@@ -161,9 +161,9 @@ void srcu_drive_gp(struct work_struct *wp)
 	 * at interrupt level, but the ->srcu_gp_running checks will
 	 * straighten that out.
 	 */
-	WRITE_ONCE(sp->srcu_gp_running, false);
-	if (READ_ONCE(sp->srcu_cb_head))
-		schedule_work(&sp->srcu_work);
+	WRITE_ONCE(ssp->srcu_gp_running, false);
+	if (READ_ONCE(ssp->srcu_cb_head))
+		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
 
@@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
  * Enqueue an SRCU callback on the specified srcu_struct structure,
  * initiating grace-period processing if it is not already running.
  */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	       rcu_callback_t func)
 {
 	unsigned long flags;
@@ -179,14 +179,14 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	rhp->func = func;
 	rhp->next = NULL;
 	local_irq_save(flags);
-	*sp->srcu_cb_tail = rhp;
-	sp->srcu_cb_tail = &rhp->next;
+	*ssp->srcu_cb_tail = rhp;
+	ssp->srcu_cb_tail = &rhp->next;
 	local_irq_restore(flags);
-	if (!READ_ONCE(sp->srcu_gp_running)) {
+	if (!READ_ONCE(ssp->srcu_gp_running)) {
 		if (likely(srcu_init_done))
-			schedule_work(&sp->srcu_work);
-		else if (list_empty(&sp->srcu_work.entry))
-			list_add(&sp->srcu_work.entry, &srcu_boot_list);
+			schedule_work(&ssp->srcu_work);
+		else if (list_empty(&ssp->srcu_work.entry))
+			list_add(&ssp->srcu_work.entry, &srcu_boot_list);
 	}
 }
 EXPORT_SYMBOL_GPL(call_srcu);
@@ -194,13 +194,13 @@ EXPORT_SYMBOL_GPL(call_srcu);
 /*
  * synchronize_srcu - wait for prior SRCU read-side critical-section completion
  */
-void synchronize_srcu(struct srcu_struct *sp)
+void synchronize_srcu(struct srcu_struct *ssp)
 {
 	struct rcu_synchronize rs;
 
 	init_rcu_head_on_stack(&rs.head);
 	init_completion(&rs.completion);
-	call_srcu(sp, &rs.head, wakeme_after_rcu);
+	call_srcu(ssp, &rs.head, wakeme_after_rcu);
 	wait_for_completion(&rs.completion);
 	destroy_rcu_head_on_stack(&rs.head);
 }
@@ -219,13 +219,13 @@ void __init rcu_scheduler_starting(void)
  */
 void __init srcu_init(void)
 {
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
 	srcu_init_done = true;
 	while (!list_empty(&srcu_boot_list)) {
-		sp = list_first_entry(&srcu_boot_list,
+		ssp = list_first_entry(&srcu_boot_list,
 				      struct srcu_struct, srcu_work.entry);
-		list_del_init(&sp->srcu_work.entry);
-		schedule_work(&sp->srcu_work);
+		list_del_init(&ssp->srcu_work.entry);
+		schedule_work(&ssp->srcu_work);
 	}
 }
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 697a2d7e8e8a..3600d88d8956 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -56,7 +56,7 @@ static LIST_HEAD(srcu_boot_list);
 static bool __read_mostly srcu_init_done;
 
 static void srcu_invoke_callbacks(struct work_struct *work);
-static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
 static void process_srcu(struct work_struct *work);
 
 /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
@@ -92,7 +92,7 @@ do {									\
  * srcu_read_unlock() running against them.  So if the is_static parameter
  * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
  */
-static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
+static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
 {
 	int cpu;
 	int i;
@@ -103,13 +103,13 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 	struct srcu_node *snp_first;
 
 	/* Work out the overall tree geometry. */
-	sp->level[0] = &sp->node[0];
+	ssp->level[0] = &ssp->node[0];
 	for (i = 1; i < rcu_num_lvls; i++)
-		sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
+		ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
 	rcu_init_levelspread(levelspread, num_rcu_lvl);
 
 	/* Each pass through this loop initializes one srcu_node structure. */
-	srcu_for_each_node_breadth_first(sp, snp) {
+	srcu_for_each_node_breadth_first(ssp, snp) {
 		spin_lock_init(&ACCESS_PRIVATE(snp, lock));
 		WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
 			     ARRAY_SIZE(snp->srcu_data_have_cbs));
@@ -120,17 +120,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 		snp->srcu_gp_seq_needed_exp = 0;
 		snp->grplo = -1;
 		snp->grphi = -1;
-		if (snp == &sp->node[0]) {
+		if (snp == &ssp->node[0]) {
 			/* Root node, special case. */
 			snp->srcu_parent = NULL;
 			continue;
 		}
 
 		/* Non-root node. */
-		if (snp == sp->level[level + 1])
+		if (snp == ssp->level[level + 1])
 			level++;
-		snp->srcu_parent = sp->level[level - 1] +
-				   (snp - sp->level[level]) /
+		snp->srcu_parent = ssp->level[level - 1] +
+				   (snp - ssp->level[level]) /
 				   levelspread[level - 1];
 	}
 
@@ -141,14 +141,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
 		     ARRAY_SIZE(sdp->srcu_unlock_count));
 	level = rcu_num_lvls - 1;
-	snp_first = sp->level[level];
+	snp_first = ssp->level[level];
 	for_each_possible_cpu(cpu) {
-		sdp = per_cpu_ptr(sp->sda, cpu);
+		sdp = per_cpu_ptr(ssp->sda, cpu);
 		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
 		rcu_segcblist_init(&sdp->srcu_cblist);
 		sdp->srcu_cblist_invoking = false;
-		sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
-		sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
 		sdp->mynode = &snp_first[cpu / levelspread[level]];
 		for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
 			if (snp->grplo < 0)
@@ -157,7 +157,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 		}
 		sdp->cpu = cpu;
 		INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
-		sdp->sp = sp;
+		sdp->ssp = ssp;
 		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
 		if (is_static)
 			continue;
@@ -176,35 +176,35 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
  * parameter is passed through to init_srcu_struct_nodes(), and
  * also tells us that ->sda has already been wired up to srcu_data.
  */
-static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
+static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 {
-	mutex_init(&sp->srcu_cb_mutex);
-	mutex_init(&sp->srcu_gp_mutex);
-	sp->srcu_idx = 0;
-	sp->srcu_gp_seq = 0;
-	sp->srcu_barrier_seq = 0;
-	mutex_init(&sp->srcu_barrier_mutex);
-	atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
-	INIT_DELAYED_WORK(&sp->work, process_srcu);
+	mutex_init(&ssp->srcu_cb_mutex);
+	mutex_init(&ssp->srcu_gp_mutex);
+	ssp->srcu_idx = 0;
+	ssp->srcu_gp_seq = 0;
+	ssp->srcu_barrier_seq = 0;
+	mutex_init(&ssp->srcu_barrier_mutex);
+	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
+	INIT_DELAYED_WORK(&ssp->work, process_srcu);
 	if (!is_static)
-		sp->sda = alloc_percpu(struct srcu_data);
-	init_srcu_struct_nodes(sp, is_static);
-	sp->srcu_gp_seq_needed_exp = 0;
-	sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
-	smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
-	return sp->sda ? 0 : -ENOMEM;
+		ssp->sda = alloc_percpu(struct srcu_data);
+	init_srcu_struct_nodes(ssp, is_static);
+	ssp->srcu_gp_seq_needed_exp = 0;
+	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+	return ssp->sda ? 0 : -ENOMEM;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
 		       struct lock_class_key *key)
 {
 	/* Don't re-initialize a lock while it is held. */
-	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
-	lockdep_init_map(&sp->dep_map, name, key, 0);
-	spin_lock_init(&ACCESS_PRIVATE(sp, lock));
-	return init_srcu_struct_fields(sp, false);
+	debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+	lockdep_init_map(&ssp->dep_map, name, key, 0);
+	spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
+	return init_srcu_struct_fields(ssp, false);
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
 
@@ -212,16 +212,16 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
 
 /**
  * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
+ * @ssp: structure to initialize.
  *
  * Must invoke this on a given srcu_struct before passing that srcu_struct
  * to any other function.  Each srcu_struct represents a separate domain
  * of SRCU protection.
  */
-int init_srcu_struct(struct srcu_struct *sp)
+int init_srcu_struct(struct srcu_struct *ssp)
 {
-	spin_lock_init(&ACCESS_PRIVATE(sp, lock));
-	return init_srcu_struct_fields(sp, false);
+	spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
+	return init_srcu_struct_fields(ssp, false);
 }
 EXPORT_SYMBOL_GPL(init_srcu_struct);
 
@@ -231,37 +231,37 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * First-use initialization of statically allocated srcu_struct
  * structure.  Wiring up the combining tree is more than can be
  * done with compile-time initialization, so this check is added
- * to each update-side SRCU primitive.  Use sp->lock, which -is-
+ * to each update-side SRCU primitive.  Use ssp->lock, which -is-
  * compile-time initialized, to resolve races involving multiple
  * CPUs trying to garner first-use privileges.
  */
-static void check_init_srcu_struct(struct srcu_struct *sp)
+static void check_init_srcu_struct(struct srcu_struct *ssp)
 {
 	unsigned long flags;
 
 	/* The smp_load_acquire() pairs with the smp_store_release(). */
-	if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
+	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
 		return; /* Already initialized. */
-	spin_lock_irqsave_rcu_node(sp, flags);
-	if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
-		spin_unlock_irqrestore_rcu_node(sp, flags);
+	spin_lock_irqsave_rcu_node(ssp, flags);
+	if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
+		spin_unlock_irqrestore_rcu_node(ssp, flags);
 		return;
 	}
-	init_srcu_struct_fields(sp, true);
-	spin_unlock_irqrestore_rcu_node(sp, flags);
+	init_srcu_struct_fields(ssp, true);
+	spin_unlock_irqrestore_rcu_node(ssp, flags);
 }
 
 /*
  * Returns approximate total of the readers' ->srcu_lock_count[] values
  * for the rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
 
 		sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
 	}
@@ -272,13 +272,13 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
  * Returns approximate total of the readers' ->srcu_unlock_count[] values
  * for the rank of per-CPU counters specified by idx.
  */
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
 {
 	int cpu;
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
 
 		sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
 	}
@@ -289,11 +289,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
  * Return true if the number of pre-existing readers is determined to
  * be zero.
  */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
 {
 	unsigned long unlocks;
 
-	unlocks = srcu_readers_unlock_idx(sp, idx);
+	unlocks = srcu_readers_unlock_idx(ssp, idx);
 
 	/*
 	 * Make sure that a lock is always counted if the corresponding
@@ -329,25 +329,25 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 	 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
 	 * especially on 64-bit systems.
 	 */
-	return srcu_readers_lock_idx(sp, idx) == unlocks;
+	return srcu_readers_lock_idx(ssp, idx) == unlocks;
 }
 
 /**
  * srcu_readers_active - returns true if there are readers. and false
  *                       otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ * @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
  *
  * Note that this is not an atomic primitive, and can therefore suffer
  * severe errors when invoked on an active srcu_struct.  That said, it
  * can be useful as an error check at cleanup time.
  */
-static bool srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *ssp)
 {
 	int cpu;
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
 
 		sum += READ_ONCE(cpuc->srcu_lock_count[0]);
 		sum += READ_ONCE(cpuc->srcu_lock_count[1]);
@@ -363,44 +363,44 @@ static bool srcu_readers_active(struct srcu_struct *sp)
  * Return grace-period delay, zero if there are expedited grace
  * periods pending, SRCU_INTERVAL otherwise.
  */
-static unsigned long srcu_get_delay(struct srcu_struct *sp)
+static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 {
-	if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
-			 READ_ONCE(sp->srcu_gp_seq_needed_exp)))
+	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
+			 READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
 		return 0;
 	return SRCU_INTERVAL;
 }
 
 /* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
-void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
+void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
 {
 	int cpu;
 
-	if (WARN_ON(!srcu_get_delay(sp)))
+	if (WARN_ON(!srcu_get_delay(ssp)))
 		return; /* Just leak it! */
-	if (WARN_ON(srcu_readers_active(sp)))
+	if (WARN_ON(srcu_readers_active(ssp)))
 		return; /* Just leak it! */
 	if (quiesced) {
-		if (WARN_ON(delayed_work_pending(&sp->work)))
+		if (WARN_ON(delayed_work_pending(&ssp->work)))
 			return; /* Just leak it! */
 	} else {
-		flush_delayed_work(&sp->work);
+		flush_delayed_work(&ssp->work);
 	}
 	for_each_possible_cpu(cpu)
 		if (quiesced) {
-			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work)))
 				return; /* Just leak it! */
 		} else {
-			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+			flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work);
 		}
-	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
-	    WARN_ON(srcu_readers_active(sp))) {
+	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+	    WARN_ON(srcu_readers_active(ssp))) {
 		pr_info("%s: Active srcu_struct %p state: %d\n",
-			__func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
-	free_percpu(sp->sda);
-	sp->sda = NULL;
+	free_percpu(ssp->sda);
+	ssp->sda = NULL;
 }
 EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
@@ -409,12 +409,12 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
  * srcu_struct.
  * Returns an index that must be passed to the matching srcu_read_unlock().
  */
-int __srcu_read_lock(struct srcu_struct *sp)
+int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
-	idx = READ_ONCE(sp->srcu_idx) & 0x1;
-	this_cpu_inc(sp->sda->srcu_lock_count[idx]);
+	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+	this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
 	return idx;
 }
@@ -425,10 +425,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
  * element of the srcu_struct.  Note that this may well be a different
  * CPU than that which was incremented by the corresponding srcu_read_lock().
  */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
+	this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -444,22 +444,22 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 /*
  * Start an SRCU grace period.
  */
-static void srcu_gp_start(struct srcu_struct *sp)
+static void srcu_gp_start(struct srcu_struct *ssp)
 {
-	struct srcu_data *sdp = this_cpu_ptr(sp->sda);
+	struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
 	int state;
 
-	lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
-	WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&sp->srcu_gp_seq));
+			      rcu_seq_current(&ssp->srcu_gp_seq));
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&sp->srcu_gp_seq));
+				       rcu_seq_snap(&ssp->srcu_gp_seq));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
-	rcu_seq_start(&sp->srcu_gp_seq);
-	state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+	rcu_seq_start(&ssp->srcu_gp_seq);
+	state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
 	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
 }
 
@@ -513,7 +513,7 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
  * just-completed grace period, the one corresponding to idx.  If possible,
  * schedule this invocation on the corresponding CPUs.
  */
-static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
+static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp,
 				  unsigned long mask, unsigned long delay)
 {
 	int cpu;
@@ -521,7 +521,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
 	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
 		if (!(mask & (1 << (cpu - snp->grplo))))
 			continue;
-		srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
+		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
 	}
 }
 
@@ -534,7 +534,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
  * are initiating callback invocation.  This allows the ->srcu_have_cbs[]
  * array to have a finite number of elements.
  */
-static void srcu_gp_end(struct srcu_struct *sp)
+static void srcu_gp_end(struct srcu_struct *ssp)
 {
 	unsigned long cbdelay;
 	bool cbs;
@@ -548,28 +548,28 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	struct srcu_node *snp;
 
 	/* Prevent more than one additional grace period. */
-	mutex_lock(&sp->srcu_cb_mutex);
+	mutex_lock(&ssp->srcu_cb_mutex);
 
 	/* End the current grace period. */
-	spin_lock_irq_rcu_node(sp);
-	idx = rcu_seq_state(sp->srcu_gp_seq);
+	spin_lock_irq_rcu_node(ssp);
+	idx = rcu_seq_state(ssp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
-	cbdelay = srcu_get_delay(sp);
-	sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
-	rcu_seq_end(&sp->srcu_gp_seq);
-	gpseq = rcu_seq_current(&sp->srcu_gp_seq);
-	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
-		sp->srcu_gp_seq_needed_exp = gpseq;
-	spin_unlock_irq_rcu_node(sp);
-	mutex_unlock(&sp->srcu_gp_mutex);
+	cbdelay = srcu_get_delay(ssp);
+	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+	rcu_seq_end(&ssp->srcu_gp_seq);
+	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
+		ssp->srcu_gp_seq_needed_exp = gpseq;
+	spin_unlock_irq_rcu_node(ssp);
+	mutex_unlock(&ssp->srcu_gp_mutex);
 	/* A new grace period can start at this point.  But only one. */
 
 	/* Initiate callback invocation as needed. */
 	idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
-	srcu_for_each_node_breadth_first(sp, snp) {
+	srcu_for_each_node_breadth_first(ssp, snp) {
 		spin_lock_irq_rcu_node(snp);
 		cbs = false;
-		last_lvl = snp >= sp->level[rcu_num_lvls - 1];
+		last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
 		if (last_lvl)
 			cbs = snp->srcu_have_cbs[idx] == gpseq;
 		snp->srcu_have_cbs[idx] = gpseq;
@@ -580,12 +580,12 @@ static void srcu_gp_end(struct srcu_struct *sp)
 		snp->srcu_data_have_cbs[idx] = 0;
 		spin_unlock_irq_rcu_node(snp);
 		if (cbs)
-			srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
+			srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
 
 		/* Occasionally prevent srcu_data counter wrap. */
 		if (!(gpseq & counter_wrap_check) && last_lvl)
 			for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
-				sdp = per_cpu_ptr(sp->sda, cpu);
+				sdp = per_cpu_ptr(ssp->sda, cpu);
 				spin_lock_irqsave_rcu_node(sdp, flags);
 				if (ULONG_CMP_GE(gpseq,
 						 sdp->srcu_gp_seq_needed + 100))
@@ -598,18 +598,18 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	}
 
 	/* Callback initiation done, allow grace periods after next. */
-	mutex_unlock(&sp->srcu_cb_mutex);
+	mutex_unlock(&ssp->srcu_cb_mutex);
 
 	/* Start a new grace period if needed. */
-	spin_lock_irq_rcu_node(sp);
-	gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+	spin_lock_irq_rcu_node(ssp);
+	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
 	if (!rcu_seq_state(gpseq) &&
-	    ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
-		srcu_gp_start(sp);
-		spin_unlock_irq_rcu_node(sp);
-		srcu_reschedule(sp, 0);
+	    ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+		srcu_gp_start(ssp);
+		spin_unlock_irq_rcu_node(ssp);
+		srcu_reschedule(ssp, 0);
 	} else {
-		spin_unlock_irq_rcu_node(sp);
+		spin_unlock_irq_rcu_node(ssp);
 	}
 }
 
@@ -620,13 +620,13 @@ static void srcu_gp_end(struct srcu_struct *sp)
  * but without expediting.  To start a completely new grace period,
  * whether expedited or not, use srcu_funnel_gp_start() instead.
  */
-static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
+static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp,
 				  unsigned long s)
 {
 	unsigned long flags;
 
 	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
+		if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
 		    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
 			return;
 		spin_lock_irqsave_rcu_node(snp, flags);
@@ -637,10 +637,10 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
 		WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
 		spin_unlock_irqrestore_rcu_node(snp, flags);
 	}
-	spin_lock_irqsave_rcu_node(sp, flags);
-	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
-		sp->srcu_gp_seq_needed_exp = s;
-	spin_unlock_irqrestore_rcu_node(sp, flags);
+	spin_lock_irqsave_rcu_node(ssp, flags);
+	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
+		ssp->srcu_gp_seq_needed_exp = s;
+	spin_unlock_irqrestore_rcu_node(ssp, flags);
 }
 
 /*
@@ -653,7 +653,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
  * Note that this function also does the work of srcu_funnel_exp_start(),
  * in some cases by directly invoking it.
  */
-static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
+static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 				 unsigned long s, bool do_norm)
 {
 	unsigned long flags;
@@ -663,7 +663,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 
 	/* Each pass through the loop does one level of the srcu_node tree. */
 	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
+		if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
 			return; /* GP already done and CBs recorded. */
 		spin_lock_irqsave_rcu_node(snp, flags);
 		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
@@ -678,7 +678,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 				return;
 			}
 			if (!do_norm)
-				srcu_funnel_exp_start(sp, snp, s);
+				srcu_funnel_exp_start(ssp, snp, s);
 			return;
 		}
 		snp->srcu_have_cbs[idx] = s;
@@ -690,29 +690,29 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 	}
 
 	/* Top of tree, must ensure the grace period will be started. */
-	spin_lock_irqsave_rcu_node(sp, flags);
-	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
+	spin_lock_irqsave_rcu_node(ssp, flags);
+	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
 		/*
 		 * Record need for grace period s.  Pair with load
 		 * acquire setting up for initialization.
 		 */
-		smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
+		smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
 	}
-	if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
-		sp->srcu_gp_seq_needed_exp = s;
+	if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
+		ssp->srcu_gp_seq_needed_exp = s;
 
 	/* If grace period not already done and none in progress, start it. */
-	if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
-	    rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
-		WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
-		srcu_gp_start(sp);
+	if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
+	    rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
+		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+		srcu_gp_start(ssp);
 		if (likely(srcu_init_done))
-			queue_delayed_work(rcu_gp_wq, &sp->work,
-					   srcu_get_delay(sp));
-		else if (list_empty(&sp->work.work.entry))
-			list_add(&sp->work.work.entry, &srcu_boot_list);
+			queue_delayed_work(rcu_gp_wq, &ssp->work,
+					   srcu_get_delay(ssp));
+		else if (list_empty(&ssp->work.work.entry))
+			list_add(&ssp->work.work.entry, &srcu_boot_list);
 	}
-	spin_unlock_irqrestore_rcu_node(sp, flags);
+	spin_unlock_irqrestore_rcu_node(ssp, flags);
 }
 
 /*
@@ -720,12 +720,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
  * loop an additional time if there is an expedited grace period pending.
  * The caller must ensure that ->srcu_idx is not changed while checking.
  */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
 {
 	for (;;) {
-		if (srcu_readers_active_idx_check(sp, idx))
+		if (srcu_readers_active_idx_check(ssp, idx))
 			return true;
-		if (--trycount + !srcu_get_delay(sp) <= 0)
+		if (--trycount + !srcu_get_delay(ssp) <= 0)
 			return false;
 		udelay(SRCU_RETRY_CHECK_DELAY);
 	}
@@ -736,7 +736,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
  * use the other rank of the ->srcu_(un)lock_count[] arrays.  This allows
  * us to wait for pre-existing readers in a starvation-free manner.
  */
-static void srcu_flip(struct srcu_struct *sp)
+static void srcu_flip(struct srcu_struct *ssp)
 {
 	/*
 	 * Ensure that if this updater saw a given reader's increment
@@ -748,7 +748,7 @@ static void srcu_flip(struct srcu_struct *sp)
 	 */
 	smp_mb(); /* E */  /* Pairs with B and C. */
 
-	WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
+	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 
 	/*
 	 * Ensure that if the updater misses an __srcu_read_unlock()
@@ -781,7 +781,7 @@ static void srcu_flip(struct srcu_struct *sp)
  * negligible when amoritized over that time period, and the extra latency
  * of a needlessly non-expedited grace period is similarly negligible.
  */
-static bool srcu_might_be_idle(struct srcu_struct *sp)
+static bool srcu_might_be_idle(struct srcu_struct *ssp)
 {
 	unsigned long curseq;
 	unsigned long flags;
@@ -790,7 +790,7 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
 
 	/* If the local srcu_data structure has callbacks, not idle.  */
 	local_irq_save(flags);
-	sdp = this_cpu_ptr(sp->sda);
+	sdp = this_cpu_ptr(ssp->sda);
 	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
 		local_irq_restore(flags);
 		return false; /* Callbacks already present, so not idle. */
@@ -806,17 +806,17 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
 	/* First, see if enough time has passed since the last GP. */
 	t = ktime_get_mono_fast_ns();
 	if (exp_holdoff == 0 ||
-	    time_in_range_open(t, sp->srcu_last_gp_end,
-			       sp->srcu_last_gp_end + exp_holdoff))
+	    time_in_range_open(t, ssp->srcu_last_gp_end,
+			       ssp->srcu_last_gp_end + exp_holdoff))
 		return false; /* Too soon after last GP. */
 
 	/* Next, check for probable idleness. */
-	curseq = rcu_seq_current(&sp->srcu_gp_seq);
+	curseq = rcu_seq_current(&ssp->srcu_gp_seq);
 	smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
-	if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
+	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
 		return false; /* Grace period in progress, so not idle. */
 	smp_mb(); /* Order ->srcu_gp_seq with prior access. */
-	if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
+	if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
 		return false; /* GP # changed, so not idle. */
 	return true; /* With reasonable probability, idle! */
 }
@@ -856,7 +856,7 @@ static void srcu_leak_callback(struct rcu_head *rhp)
  * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
  * srcu_struct structure.
  */
-void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 		 rcu_callback_t func, bool do_norm)
 {
 	unsigned long flags;
@@ -866,7 +866,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	unsigned long s;
 	struct srcu_data *sdp;
 
-	check_init_srcu_struct(sp);
+	check_init_srcu_struct(ssp);
 	if (debug_rcu_head_queue(rhp)) {
 		/* Probable double call_srcu(), so leak the callback. */
 		WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -874,14 +874,14 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		return;
 	}
 	rhp->func = func;
-	idx = srcu_read_lock(sp);
+	idx = srcu_read_lock(ssp);
 	local_irq_save(flags);
-	sdp = this_cpu_ptr(sp->sda);
+	sdp = this_cpu_ptr(ssp->sda);
 	spin_lock_rcu_node(sdp);
 	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&sp->srcu_gp_seq));
-	s = rcu_seq_snap(&sp->srcu_gp_seq);
+			      rcu_seq_current(&ssp->srcu_gp_seq));
+	s = rcu_seq_snap(&ssp->srcu_gp_seq);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
@@ -893,15 +893,15 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	}
 	spin_unlock_irqrestore_rcu_node(sdp, flags);
 	if (needgp)
-		srcu_funnel_gp_start(sp, sdp, s, do_norm);
+		srcu_funnel_gp_start(ssp, sdp, s, do_norm);
 	else if (needexp)
-		srcu_funnel_exp_start(sp, sdp->mynode, s);
-	srcu_read_unlock(sp, idx);
+		srcu_funnel_exp_start(ssp, sdp->mynode, s);
+	srcu_read_unlock(ssp, idx);
 }
 
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
- * @sp: srcu_struct in queue the callback
+ * @ssp: srcu_struct in queue the callback
  * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
@@ -916,21 +916,21 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
  * The callback will be invoked from process context, but must nevertheless
  * be fast and must not block.
  */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	       rcu_callback_t func)
 {
-	__call_srcu(sp, rhp, func, true);
+	__call_srcu(ssp, rhp, func, true);
 }
 EXPORT_SYMBOL_GPL(call_srcu);
 
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
-static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
+static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 {
 	struct rcu_synchronize rcu;
 
-	RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+	RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) ||
 			 lock_is_held(&rcu_bh_lock_map) ||
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
@@ -939,10 +939,10 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
 	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return;
 	might_sleep();
-	check_init_srcu_struct(sp);
+	check_init_srcu_struct(ssp);
 	init_completion(&rcu.completion);
 	init_rcu_head_on_stack(&rcu.head);
-	__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
+	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
 	wait_for_completion(&rcu.completion);
 	destroy_rcu_head_on_stack(&rcu.head);
 
@@ -958,7 +958,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
 
 /**
  * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
+ * @ssp: srcu_struct with which to synchronize.
  *
  * Wait for an SRCU grace period to elapse, but be more aggressive about
  * spinning rather than blocking when waiting.
@@ -966,15 +966,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
  * Note that synchronize_srcu_expedited() has the same deadlock and
  * memory-ordering properties as does synchronize_srcu().
  */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
+void synchronize_srcu_expedited(struct srcu_struct *ssp)
 {
-	__synchronize_srcu(sp, rcu_gp_is_normal());
+	__synchronize_srcu(ssp, rcu_gp_is_normal());
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
 
 /**
  * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
+ * @ssp: srcu_struct with which to synchronize.
  *
  * Wait for the count to drain to zero of both indexes. To avoid the
  * possible starvation of synchronize_srcu(), it waits for the count of
@@ -1016,12 +1016,12 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
  * SRCU must also provide it.  Note that detecting idleness is heuristic
  * and subject to both false positives and negatives.
  */
-void synchronize_srcu(struct srcu_struct *sp)
+void synchronize_srcu(struct srcu_struct *ssp)
 {
-	if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
-		synchronize_srcu_expedited(sp);
+	if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited())
+		synchronize_srcu_expedited(ssp);
 	else
-		__synchronize_srcu(sp, true);
+		__synchronize_srcu(ssp, true);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 
@@ -1031,36 +1031,36 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
 static void srcu_barrier_cb(struct rcu_head *rhp)
 {
 	struct srcu_data *sdp;
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
 	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
-	sp = sdp->sp;
-	if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
-		complete(&sp->srcu_barrier_completion);
+	ssp = sdp->ssp;
+	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
+		complete(&ssp->srcu_barrier_completion);
 }
 
 /**
  * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
+ * @ssp: srcu_struct on which to wait for in-flight callbacks.
  */
-void srcu_barrier(struct srcu_struct *sp)
+void srcu_barrier(struct srcu_struct *ssp)
 {
 	int cpu;
 	struct srcu_data *sdp;
-	unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
+	unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
 
-	check_init_srcu_struct(sp);
-	mutex_lock(&sp->srcu_barrier_mutex);
-	if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
+	check_init_srcu_struct(ssp);
+	mutex_lock(&ssp->srcu_barrier_mutex);
+	if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
 		smp_mb(); /* Force ordering following return. */
-		mutex_unlock(&sp->srcu_barrier_mutex);
+		mutex_unlock(&ssp->srcu_barrier_mutex);
 		return; /* Someone else did our work for us. */
 	}
-	rcu_seq_start(&sp->srcu_barrier_seq);
-	init_completion(&sp->srcu_barrier_completion);
+	rcu_seq_start(&ssp->srcu_barrier_seq);
+	init_completion(&ssp->srcu_barrier_completion);
 
 	/* Initial count prevents reaching zero until all CBs are posted. */
-	atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
+	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
 
 	/*
 	 * Each pass through this loop enqueues a callback, but only
@@ -1071,39 +1071,39 @@ void srcu_barrier(struct srcu_struct *sp)
 	 * grace period as the last callback already in the queue.
 	 */
 	for_each_possible_cpu(cpu) {
-		sdp = per_cpu_ptr(sp->sda, cpu);
+		sdp = per_cpu_ptr(ssp->sda, cpu);
 		spin_lock_irq_rcu_node(sdp);
-		atomic_inc(&sp->srcu_barrier_cpu_cnt);
+		atomic_inc(&ssp->srcu_barrier_cpu_cnt);
 		sdp->srcu_barrier_head.func = srcu_barrier_cb;
 		debug_rcu_head_queue(&sdp->srcu_barrier_head);
 		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
 					   &sdp->srcu_barrier_head, 0)) {
 			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
-			atomic_dec(&sp->srcu_barrier_cpu_cnt);
+			atomic_dec(&ssp->srcu_barrier_cpu_cnt);
 		}
 		spin_unlock_irq_rcu_node(sdp);
 	}
 
 	/* Remove the initial count, at which point reaching zero can happen. */
-	if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
-		complete(&sp->srcu_barrier_completion);
-	wait_for_completion(&sp->srcu_barrier_completion);
+	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
+		complete(&ssp->srcu_barrier_completion);
+	wait_for_completion(&ssp->srcu_barrier_completion);
 
-	rcu_seq_end(&sp->srcu_barrier_seq);
-	mutex_unlock(&sp->srcu_barrier_mutex);
+	rcu_seq_end(&ssp->srcu_barrier_seq);
+	mutex_unlock(&ssp->srcu_barrier_mutex);
 }
 EXPORT_SYMBOL_GPL(srcu_barrier);
 
 /**
  * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
+ * @ssp: srcu_struct on which to report batch completion.
  *
  * Report the number of batches, correlated with, but not necessarily
  * precisely the same as, the number of grace periods that have elapsed.
  */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *ssp)
 {
-	return sp->srcu_idx;
+	return ssp->srcu_idx;
 }
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
 
@@ -1112,11 +1112,11 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed);
  * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
  * completed in that state.
  */
-static void srcu_advance_state(struct srcu_struct *sp)
+static void srcu_advance_state(struct srcu_struct *ssp)
 {
 	int idx;
 
-	mutex_lock(&sp->srcu_gp_mutex);
+	mutex_lock(&ssp->srcu_gp_mutex);
 
 	/*
 	 * Because readers might be delayed for an extended period after
@@ -1128,47 +1128,47 @@ static void srcu_advance_state(struct srcu_struct *sp)
 	 * The load-acquire ensures that we see the accesses performed
 	 * by the prior grace period.
 	 */
-	idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
+	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
 	if (idx == SRCU_STATE_IDLE) {
-		spin_lock_irq_rcu_node(sp);
-		if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
-			WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
-			spin_unlock_irq_rcu_node(sp);
-			mutex_unlock(&sp->srcu_gp_mutex);
+		spin_lock_irq_rcu_node(ssp);
+		if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
+			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
+			spin_unlock_irq_rcu_node(ssp);
+			mutex_unlock(&ssp->srcu_gp_mutex);
 			return;
 		}
-		idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+		idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
 		if (idx == SRCU_STATE_IDLE)
-			srcu_gp_start(sp);
-		spin_unlock_irq_rcu_node(sp);
+			srcu_gp_start(ssp);
+		spin_unlock_irq_rcu_node(ssp);
 		if (idx != SRCU_STATE_IDLE) {
-			mutex_unlock(&sp->srcu_gp_mutex);
+			mutex_unlock(&ssp->srcu_gp_mutex);
 			return; /* Someone else started the grace period. */
 		}
 	}
 
-	if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
-		idx = 1 ^ (sp->srcu_idx & 1);
-		if (!try_check_zero(sp, idx, 1)) {
-			mutex_unlock(&sp->srcu_gp_mutex);
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+		idx = 1 ^ (ssp->srcu_idx & 1);
+		if (!try_check_zero(ssp, idx, 1)) {
+			mutex_unlock(&ssp->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
-		srcu_flip(sp);
-		rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
+		srcu_flip(ssp);
+		rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
 	}
 
-	if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
 
 		/*
 		 * SRCU read-side critical sections are normally short,
 		 * so check at least twice in quick succession after a flip.
 		 */
-		idx = 1 ^ (sp->srcu_idx & 1);
-		if (!try_check_zero(sp, idx, 2)) {
-			mutex_unlock(&sp->srcu_gp_mutex);
+		idx = 1 ^ (ssp->srcu_idx & 1);
+		if (!try_check_zero(ssp, idx, 2)) {
+			mutex_unlock(&ssp->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
-		srcu_gp_end(sp);  /* Releases ->srcu_gp_mutex. */
+		srcu_gp_end(ssp);  /* Releases ->srcu_gp_mutex. */
 	}
 }
 
@@ -1184,14 +1184,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	struct rcu_cblist ready_cbs;
 	struct rcu_head *rhp;
 	struct srcu_data *sdp;
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
 	sdp = container_of(work, struct srcu_data, work.work);
-	sp = sdp->sp;
+	ssp = sdp->ssp;
 	rcu_cblist_init(&ready_cbs);
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&sp->srcu_gp_seq));
+			      rcu_seq_current(&ssp->srcu_gp_seq));
 	if (sdp->srcu_cblist_invoking ||
 	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
 		spin_unlock_irq_rcu_node(sdp);
@@ -1217,7 +1217,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&sp->srcu_gp_seq));
+				       rcu_seq_snap(&ssp->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
 	spin_unlock_irq_rcu_node(sdp);
@@ -1229,24 +1229,24 @@ static void srcu_invoke_callbacks(struct work_struct *work)
  * Finished one round of SRCU grace period.  Start another if there are
  * more SRCU callbacks queued, otherwise put SRCU into not-running state.
  */
-static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
 {
 	bool pushgp = true;
 
-	spin_lock_irq_rcu_node(sp);
-	if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
-		if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
+	spin_lock_irq_rcu_node(ssp);
+	if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
+		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
 			/* All requests fulfilled, time to go idle. */
 			pushgp = false;
 		}
-	} else if (!rcu_seq_state(sp->srcu_gp_seq)) {
+	} else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
 		/* Outstanding request and no GP.  Start one. */
-		srcu_gp_start(sp);
+		srcu_gp_start(ssp);
 	}
-	spin_unlock_irq_rcu_node(sp);
+	spin_unlock_irq_rcu_node(ssp);
 
 	if (pushgp)
-		queue_delayed_work(rcu_gp_wq, &sp->work, delay);
+		queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
 }
 
 /*
@@ -1254,41 +1254,41 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
  */
 static void process_srcu(struct work_struct *work)
 {
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
-	sp = container_of(work, struct srcu_struct, work.work);
+	ssp = container_of(work, struct srcu_struct, work.work);
 
-	srcu_advance_state(sp);
-	srcu_reschedule(sp, srcu_get_delay(sp));
+	srcu_advance_state(ssp);
+	srcu_reschedule(ssp, srcu_get_delay(ssp));
 }
 
 void srcutorture_get_gp_data(enum rcutorture_type test_type,
-			     struct srcu_struct *sp, int *flags,
+			     struct srcu_struct *ssp, int *flags,
 			     unsigned long *gp_seq)
 {
 	if (test_type != SRCU_FLAVOR)
 		return;
 	*flags = 0;
-	*gp_seq = rcu_seq_current(&sp->srcu_gp_seq);
+	*gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
 }
 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
 
-void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 {
 	int cpu;
 	int idx;
 	unsigned long s0 = 0, s1 = 0;
 
-	idx = sp->srcu_idx & 0x1;
+	idx = ssp->srcu_idx & 0x1;
 	pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
-		 tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx);
+		 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
 	for_each_possible_cpu(cpu) {
 		unsigned long l0, l1;
 		unsigned long u0, u1;
 		long c0, c1;
 		struct srcu_data *sdp;
 
-		sdp = per_cpu_ptr(sp->sda, cpu);
+		sdp = per_cpu_ptr(ssp->sda, cpu);
 		u0 = sdp->srcu_unlock_count[!idx];
 		u1 = sdp->srcu_unlock_count[idx];
 
@@ -1323,14 +1323,14 @@ early_initcall(srcu_bootup_announce);
 
 void __init srcu_init(void)
 {
-	struct srcu_struct *sp;
+	struct srcu_struct *ssp;
 
 	srcu_init_done = true;
 	while (!list_empty(&srcu_boot_list)) {
-		sp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+		ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
 				      work.work.entry);
-		check_init_srcu_struct(sp);
-		list_del_init(&sp->work.work.entry);
-		queue_work(rcu_gp_wq, &sp->work.work);
+		check_init_srcu_struct(ssp);
+		list_del_init(&ssp->work.work.entry);
+		queue_work(rcu_gp_wq, &ssp->work.work);
 	}
 }
-- 
cgit 


From 9adcfaffc34d53e498637237fb3701560359d50b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Sat, 24 Nov 2018 13:10:25 +0900
Subject: printk: Make printk_emit() local function.

printk_emit() is called from only devkmsg_write() in the same file.
Save object size by making it a local function.

Link: http://lkml.kernel.org/r/5cc99d2c-c408-34f7-d1fc-e1cd2a9e31da@i-love.sakura.ne.jp
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h |  5 -----
 kernel/printk/printk.c | 30 ++++++++++++++----------------
 2 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index cf3eccfe1543..55aa96975fa2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -166,11 +166,6 @@ int vprintk_emit(int facility, int level,
 asmlinkage __printf(1, 0)
 int vprintk(const char *fmt, va_list args);
 
-asmlinkage __printf(5, 6) __cold
-int printk_emit(int facility, int level,
-		const char *dict, size_t dictlen,
-		const char *fmt, ...);
-
 asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b77150ad1965..a1d88212a5d2 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -753,6 +753,19 @@ struct devkmsg_user {
 	char buf[CONSOLE_EXT_LOG_MAX];
 };
 
+static __printf(3, 4) __cold
+int devkmsg_emit(int facility, int level, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+	va_end(args);
+
+	return r;
+}
+
 static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	char *buf, *line;
@@ -811,7 +824,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 		}
 	}
 
-	printk_emit(facility, level, NULL, 0, "%s", line);
+	devkmsg_emit(facility, level, "%s", line);
 	kfree(buf);
 	return ret;
 }
@@ -1936,21 +1949,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL(vprintk);
 
-asmlinkage int printk_emit(int facility, int level,
-			   const char *dict, size_t dictlen,
-			   const char *fmt, ...)
-{
-	va_list args;
-	int r;
-
-	va_start(args, fmt);
-	r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
-	va_end(args);
-
-	return r;
-}
-EXPORT_SYMBOL(printk_emit);
-
 int vprintk_default(const char *fmt, va_list args)
 {
 	int r;
-- 
cgit 


From 2d25bc55235314d869459c574be14e8faa73aca3 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@kernel.org>
Date: Mon, 19 Nov 2018 17:43:58 +0100
Subject: module: make it clearer when we're handling kallsyms symbols vs
 exported symbols

The module loader internally works with both exported symbols
represented as struct kernel_symbol, as well as Elf symbols from a
module's symbol table. It's hard to distinguish sometimes which type of
symbol we're handling given that some helper function names are not
consistent or helpful. Take get_ksymbol() for instance - are we
looking for an exported symbol or a kallsyms symbol here? Or symname()
and kernel_symbol_name() - which function handles an exported symbol and
which one an Elf symbol?

Clean up and unify the function naming scheme a bit to make it clear
which kind of symbol we're handling. This change only affects static
functions internal to the module loader.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 78 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 49a405891587..1b5edf78694c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -495,9 +495,9 @@ struct find_symbol_arg {
 	const struct kernel_symbol *sym;
 };
 
-static bool check_symbol(const struct symsearch *syms,
-				 struct module *owner,
-				 unsigned int symnum, void *data)
+static bool check_exported_symbol(const struct symsearch *syms,
+				  struct module *owner,
+				  unsigned int symnum, void *data)
 {
 	struct find_symbol_arg *fsa = data;
 
@@ -555,9 +555,9 @@ static int cmp_name(const void *va, const void *vb)
 	return strcmp(a, kernel_symbol_name(b));
 }
 
-static bool find_symbol_in_section(const struct symsearch *syms,
-				   struct module *owner,
-				   void *data)
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+					    struct module *owner,
+					    void *data)
 {
 	struct find_symbol_arg *fsa = data;
 	struct kernel_symbol *sym;
@@ -565,13 +565,14 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
 			sizeof(struct kernel_symbol), cmp_name);
 
-	if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
+	if (sym != NULL && check_exported_symbol(syms, owner,
+						 sym - syms->start, data))
 		return true;
 
 	return false;
 }
 
-/* Find a symbol and return it, along with, (optional) crc and
+/* Find an exported symbol and return it, along with, (optional) crc and
  * (optional) module which owns it.  Needs preempt disabled or module_mutex. */
 const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
@@ -585,7 +586,7 @@ const struct kernel_symbol *find_symbol(const char *name,
 	fsa.gplok = gplok;
 	fsa.warn = warn;
 
-	if (each_symbol_section(find_symbol_in_section, &fsa)) {
+	if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
 		if (owner)
 			*owner = fsa.owner;
 		if (crc)
@@ -2198,7 +2199,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
  *
  * You must hold the module_mutex.
  */
-static int verify_export_symbols(struct module *mod)
+static int verify_exported_symbols(struct module *mod)
 {
 	unsigned int i;
 	struct module *owner;
@@ -2519,10 +2520,10 @@ static void free_modinfo(struct module *mod)
 
 #ifdef CONFIG_KALLSYMS
 
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
-	const struct kernel_symbol *start,
-	const struct kernel_symbol *stop)
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+							  const struct kernel_symbol *start,
+							  const struct kernel_symbol *stop)
 {
 	return bsearch(name, start, stop - start,
 			sizeof(struct kernel_symbol), cmp_name);
@@ -2533,9 +2534,10 @@ static int is_exported(const char *name, unsigned long value,
 {
 	const struct kernel_symbol *ks;
 	if (!mod)
-		ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+		ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
 	else
-		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+		ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
 	return ks != NULL && kernel_symbol_value(ks) == value;
 }
 
@@ -3592,7 +3594,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	mutex_lock(&module_mutex);
 
 	/* Find duplicate symbols (must be called under lock). */
-	err = verify_export_symbols(mod);
+	err = verify_exported_symbols(mod);
 	if (err < 0)
 		goto out;
 
@@ -3911,15 +3913,19 @@ static inline int is_arm_mapping_symbol(const char *str)
 	       && (str[2] == '\0' || str[2] == '.');
 }
 
-static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
 {
 	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
 }
 
-static const char *get_ksymbol(struct module *mod,
-			       unsigned long addr,
-			       unsigned long *size,
-			       unsigned long *offset)
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+					unsigned long addr,
+					unsigned long *size,
+					unsigned long *offset)
 {
 	unsigned int i, best = 0;
 	unsigned long nextval;
@@ -3939,8 +3945,8 @@ static const char *get_ksymbol(struct module *mod,
 
 		/* We ignore unnamed symbols: they're uninformative
 		 * and inserted at a whim. */
-		if (*symname(kallsyms, i) == '\0'
-		    || is_arm_mapping_symbol(symname(kallsyms, i)))
+		if (*kallsyms_symbol_name(kallsyms, i) == '\0'
+		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
 			continue;
 
 		if (kallsyms->symtab[i].st_value <= addr
@@ -3958,7 +3964,8 @@ static const char *get_ksymbol(struct module *mod,
 		*size = nextval - kallsyms->symtab[best].st_value;
 	if (offset)
 		*offset = addr - kallsyms->symtab[best].st_value;
-	return symname(kallsyms, best);
+
+	return kallsyms_symbol_name(kallsyms, best);
 }
 
 void * __weak dereference_module_function_descriptor(struct module *mod,
@@ -3983,7 +3990,8 @@ const char *module_address_lookup(unsigned long addr,
 	if (mod) {
 		if (modname)
 			*modname = mod->name;
-		ret = get_ksymbol(mod, addr, size, offset);
+
+		ret = find_kallsyms_symbol(mod, addr, size, offset);
 	}
 	/* Make a copy in here where it's safe */
 	if (ret) {
@@ -4006,9 +4014,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
 		if (within_module(addr, mod)) {
 			const char *sym;
 
-			sym = get_ksymbol(mod, addr, NULL, NULL);
+			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
 			if (!sym)
 				goto out;
+
 			strlcpy(symname, sym, KSYM_NAME_LEN);
 			preempt_enable();
 			return 0;
@@ -4031,7 +4040,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
 		if (within_module(addr, mod)) {
 			const char *sym;
 
-			sym = get_ksymbol(mod, addr, size, offset);
+			sym = find_kallsyms_symbol(mod, addr, size, offset);
 			if (!sym)
 				goto out;
 			if (modname)
@@ -4062,7 +4071,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		if (symnum < kallsyms->num_symtab) {
 			*value = kallsyms->symtab[symnum].st_value;
 			*type = kallsyms->symtab[symnum].st_info;
-			strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
+			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, *value, mod);
 			preempt_enable();
@@ -4074,13 +4083,14 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 	return -ERANGE;
 }
 
-static unsigned long mod_find_symname(struct module *mod, const char *name)
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
 {
 	unsigned int i;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
 	for (i = 0; i < kallsyms->num_symtab; i++)
-		if (strcmp(name, symname(kallsyms, i)) == 0 &&
+		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
 		    kallsyms->symtab[i].st_shndx != SHN_UNDEF)
 			return kallsyms->symtab[i].st_value;
 	return 0;
@@ -4097,12 +4107,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	preempt_disable();
 	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
 		if ((mod = find_module_all(name, colon - name, false)) != NULL)
-			ret = mod_find_symname(mod, colon+1);
+			ret = find_kallsyms_symbol_value(mod, colon+1);
 	} else {
 		list_for_each_entry_rcu(mod, &modules, list) {
 			if (mod->state == MODULE_STATE_UNFORMED)
 				continue;
-			if ((ret = mod_find_symname(mod, name)) != 0)
+			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
 				break;
 		}
 	}
@@ -4131,7 +4141,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 			if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
 				continue;
 
-			ret = fn(data, symname(kallsyms, i),
+			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
 				 mod, kallsyms->symtab[i].st_value);
 			if (ret != 0)
 				return ret;
-- 
cgit 


From 96c6935212d632cb78db9149c61005b839e03b75 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Tue, 6 Nov 2018 09:38:06 -0500
Subject: PM / QoS: Change to use DEFINE_SHOW_ATTRIBUTE macro

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 86d72ffb811b..b7a82502857a 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,7 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
 	c->target_value = value;
 }
 
-static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+static int pm_qos_debug_show(struct seq_file *s, void *unused)
 {
 	struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
 	struct pm_qos_constraints *c;
@@ -245,18 +245,7 @@ out:
 	return 0;
 }
 
-static int pm_qos_dbg_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pm_qos_dbg_show_requests,
-			   inode->i_private);
-}
-
-static const struct file_operations pm_qos_debug_fops = {
-	.open           = pm_qos_dbg_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(pm_qos_debug);
 
 /**
  * pm_qos_update_target - manages the constraints list and calls the notifiers
-- 
cgit 


From c43ac4a5301986c015137bb89568979f9b3264ca Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 27 Nov 2018 09:36:37 -0500
Subject: tracing: Do not line wrap short line in function_graph_enter()

Commit 588ca1786f2dd ("function_graph: Use new curr_ret_depth to manage
depth instead of curr_ret_stack") removed a parameter from the call
ftrace_push_return_trace() that made it so that the entire call was under 80
characters, but it did not remove the line break. There's no reason to break
that line up, so make it a single line.

Link: http://lkml.kernel.org/r/20181122100322.GN2131@hirez.programming.kicks-ass.net

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 086af4f5c3e8..0d235e44d08e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -188,8 +188,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
-	if (ftrace_push_return_trace(ret, func,
-				     frame_pointer, retp))
+	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
 		goto out;
 
 	/* Only trace if the calling function expects to */
-- 
cgit 


From d864a3ca883095aa12575b84841ebd52b3d808fa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 12 Nov 2018 15:21:22 -0500
Subject: fgraph: Create a fgraph.c file to store function graph infrastructure

As the function graph infrastructure can be used by thing other than
tracing, moving the code to its own file out of the trace_functions_graph.c
code makes more sense.

The fgraph.c file will only contain the infrastructure required to hook into
functions and their return code.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Makefile                |   1 +
 kernel/trace/fgraph.c                | 232 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_functions_graph.c | 220 ---------------------------------
 3 files changed, 233 insertions(+), 220 deletions(-)
 create mode 100644 kernel/trace/fgraph.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f81dadbc7c4a..c7ade7965464 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += fgraph.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
 endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
new file mode 100644
index 000000000000..5ad9c0e88b80
--- /dev/null
+++ b/kernel/trace/fgraph.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Infrastructure to took into function calls and returns.
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ * Highly modified by Steven Rostedt (VMware).
+ */
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+	return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+	kill_ftrace_graph = true;
+}
+
+/* Add a function return address to the trace stack on thread info.*/
+static int
+ftrace_push_return_trace(unsigned long ret, unsigned long func,
+			 unsigned long frame_pointer, unsigned long *retp)
+{
+	unsigned long long calltime;
+	int index;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		return -EBUSY;
+
+	if (!current->ret_stack)
+		return -EBUSY;
+
+	/*
+	 * We must make sure the ret_stack is tested before we read
+	 * anything else.
+	 */
+	smp_rmb();
+
+	/* The return trace stack is full */
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
+		return -EBUSY;
+	}
+
+	/*
+	 * The curr_ret_stack is an index to ftrace return stack of
+	 * current task.  Its value should be in [0, FTRACE_RETFUNC_
+	 * DEPTH) when the function graph tracer is used.  To support
+	 * filtering out specific functions, it makes the index
+	 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
+	 * so when it sees a negative index the ftrace will ignore
+	 * the record.  And the index gets recovered when returning
+	 * from the filtered function by adding the FTRACE_NOTRACE_
+	 * DEPTH and then it'll continue to record functions normally.
+	 *
+	 * The curr_ret_stack is initialized to -1 and get increased
+	 * in this function.  So it can be less than -1 only if it was
+	 * filtered out via ftrace_graph_notrace_addr() which can be
+	 * set from set_graph_notrace file in tracefs by user.
+	 */
+	if (current->curr_ret_stack < -1)
+		return -EBUSY;
+
+	calltime = trace_clock_local();
+
+	index = ++current->curr_ret_stack;
+	if (ftrace_graph_notrace_addr(func))
+		current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
+	barrier();
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = calltime;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+	current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+	current->ret_stack[index].retp = retp;
+#endif
+	return 0;
+}
+
+int function_graph_enter(unsigned long ret, unsigned long func,
+			 unsigned long frame_pointer, unsigned long *retp)
+{
+	struct ftrace_graph_ent trace;
+
+	trace.func = func;
+	trace.depth = ++current->curr_ret_depth;
+
+	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+		goto out;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace))
+		goto out_ret;
+
+	return 0;
+ out_ret:
+	current->curr_ret_stack--;
+ out:
+	current->curr_ret_depth--;
+	return -EBUSY;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+			unsigned long frame_pointer)
+{
+	int index;
+
+	index = current->curr_ret_stack;
+
+	/*
+	 * A negative index here means that it's just returned from a
+	 * notrace'd function.  Recover index to get an original
+	 * return address.  See ftrace_push_return_trace().
+	 *
+	 * TODO: Need to check whether the stack gets corrupted.
+	 */
+	if (index < 0)
+		index += FTRACE_NOTRACE_DEPTH;
+
+	if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic, otherwise we have no where to go */
+		*ret = (unsigned long)panic;
+		return;
+	}
+
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+	/*
+	 * The arch may choose to record the frame pointer used
+	 * and check it here to make sure that it is what we expect it
+	 * to be. If gcc does not set the place holder of the return
+	 * address in the frame pointer, and does a copy instead, then
+	 * the function graph trace will fail. This test detects this
+	 * case.
+	 *
+	 * Currently, x86_32 with optimize for size (-Os) makes the latest
+	 * gcc do the above.
+	 *
+	 * Note, -mfentry does not use frame pointers, and this test
+	 *  is not needed if CC_USING_FENTRY is set.
+	 */
+	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+		ftrace_graph_stop();
+		WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+		     "  from func %ps return to %lx\n",
+		     current->ret_stack[index].fp,
+		     frame_pointer,
+		     (void *)current->ret_stack[index].func,
+		     current->ret_stack[index].ret);
+		*ret = (unsigned long)panic;
+		return;
+	}
+#endif
+
+	*ret = current->ret_stack[index].ret;
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = current->curr_ret_depth--;
+	/*
+	 * We still want to trace interrupts coming in if
+	 * max_depth is set to 1. Make sure the decrement is
+	 * seen before ftrace_graph_return.
+	 */
+	barrier();
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+	struct ftrace_graph_ret trace;
+	unsigned long ret;
+
+	ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+	trace.rettime = trace_clock_local();
+	ftrace_graph_return(&trace);
+	/*
+	 * The ftrace_graph_return() may still access the current
+	 * ret_stack structure, we need to make sure the update of
+	 * curr_ret_stack is after that.
+	 */
+	barrier();
+	current->curr_ret_stack--;
+	/*
+	 * The curr_ret_stack can be less than -1 only if it was
+	 * filtered out and it's about to return from the function.
+	 * Recover the index and continue to trace normal functions.
+	 */
+	if (current->curr_ret_stack < -1) {
+		current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
+		return ret;
+	}
+
+	if (unlikely(!ret)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic. What else to do? */
+		ret = (unsigned long)panic;
+	}
+
+	return ret;
+}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0d235e44d08e..b846d82c2f95 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -16,33 +16,6 @@
 #include "trace.h"
 #include "trace_output.h"
 
-static bool kill_ftrace_graph;
-
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
- */
-bool ftrace_graph_is_dead(void)
-{
-	return kill_ftrace_graph;
-}
-
-/**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
- *
- * In case of an error int function graph tracing, this is called
- * to try to keep function graph tracing from causing any more harm.
- * Usually this is pretty severe and this is called to try to at least
- * get a warning out to the user.
- */
-void ftrace_graph_stop(void)
-{
-	kill_ftrace_graph = true;
-}
-
 /* When set, irq functions will be ignored */
 static int ftrace_graph_skip_irqs;
 
@@ -117,199 +90,6 @@ static void
 print_graph_duration(struct trace_array *tr, unsigned long long duration,
 		     struct trace_seq *s, u32 flags);
 
-/* Add a function return address to the trace stack on thread info.*/
-static int
-ftrace_push_return_trace(unsigned long ret, unsigned long func,
-			 unsigned long frame_pointer, unsigned long *retp)
-{
-	unsigned long long calltime;
-	int index;
-
-	if (unlikely(ftrace_graph_is_dead()))
-		return -EBUSY;
-
-	if (!current->ret_stack)
-		return -EBUSY;
-
-	/*
-	 * We must make sure the ret_stack is tested before we read
-	 * anything else.
-	 */
-	smp_rmb();
-
-	/* The return trace stack is full */
-	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
-		atomic_inc(&current->trace_overrun);
-		return -EBUSY;
-	}
-
-	/*
-	 * The curr_ret_stack is an index to ftrace return stack of
-	 * current task.  Its value should be in [0, FTRACE_RETFUNC_
-	 * DEPTH) when the function graph tracer is used.  To support
-	 * filtering out specific functions, it makes the index
-	 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
-	 * so when it sees a negative index the ftrace will ignore
-	 * the record.  And the index gets recovered when returning
-	 * from the filtered function by adding the FTRACE_NOTRACE_
-	 * DEPTH and then it'll continue to record functions normally.
-	 *
-	 * The curr_ret_stack is initialized to -1 and get increased
-	 * in this function.  So it can be less than -1 only if it was
-	 * filtered out via ftrace_graph_notrace_addr() which can be
-	 * set from set_graph_notrace file in tracefs by user.
-	 */
-	if (current->curr_ret_stack < -1)
-		return -EBUSY;
-
-	calltime = trace_clock_local();
-
-	index = ++current->curr_ret_stack;
-	if (ftrace_graph_notrace_addr(func))
-		current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
-	barrier();
-	current->ret_stack[index].ret = ret;
-	current->ret_stack[index].func = func;
-	current->ret_stack[index].calltime = calltime;
-#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-	current->ret_stack[index].fp = frame_pointer;
-#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-	current->ret_stack[index].retp = retp;
-#endif
-	return 0;
-}
-
-int function_graph_enter(unsigned long ret, unsigned long func,
-			 unsigned long frame_pointer, unsigned long *retp)
-{
-	struct ftrace_graph_ent trace;
-
-	trace.func = func;
-	trace.depth = ++current->curr_ret_depth;
-
-	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
-		goto out;
-
-	/* Only trace if the calling function expects to */
-	if (!ftrace_graph_entry(&trace))
-		goto out_ret;
-
-	return 0;
- out_ret:
-	current->curr_ret_stack--;
- out:
-	current->curr_ret_depth--;
-	return -EBUSY;
-}
-
-/* Retrieve a function return address to the trace stack on thread info.*/
-static void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
-			unsigned long frame_pointer)
-{
-	int index;
-
-	index = current->curr_ret_stack;
-
-	/*
-	 * A negative index here means that it's just returned from a
-	 * notrace'd function.  Recover index to get an original
-	 * return address.  See ftrace_push_return_trace().
-	 *
-	 * TODO: Need to check whether the stack gets corrupted.
-	 */
-	if (index < 0)
-		index += FTRACE_NOTRACE_DEPTH;
-
-	if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic, otherwise we have no where to go */
-		*ret = (unsigned long)panic;
-		return;
-	}
-
-#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-	/*
-	 * The arch may choose to record the frame pointer used
-	 * and check it here to make sure that it is what we expect it
-	 * to be. If gcc does not set the place holder of the return
-	 * address in the frame pointer, and does a copy instead, then
-	 * the function graph trace will fail. This test detects this
-	 * case.
-	 *
-	 * Currently, x86_32 with optimize for size (-Os) makes the latest
-	 * gcc do the above.
-	 *
-	 * Note, -mfentry does not use frame pointers, and this test
-	 *  is not needed if CC_USING_FENTRY is set.
-	 */
-	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
-		ftrace_graph_stop();
-		WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
-		     "  from func %ps return to %lx\n",
-		     current->ret_stack[index].fp,
-		     frame_pointer,
-		     (void *)current->ret_stack[index].func,
-		     current->ret_stack[index].ret);
-		*ret = (unsigned long)panic;
-		return;
-	}
-#endif
-
-	*ret = current->ret_stack[index].ret;
-	trace->func = current->ret_stack[index].func;
-	trace->calltime = current->ret_stack[index].calltime;
-	trace->overrun = atomic_read(&current->trace_overrun);
-	trace->depth = current->curr_ret_depth--;
-	/*
-	 * We still want to trace interrupts coming in if
-	 * max_depth is set to 1. Make sure the decrement is
-	 * seen before ftrace_graph_return.
-	 */
-	barrier();
-}
-
-/*
- * Send the trace to the ring-buffer.
- * @return the original return address.
- */
-unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
-{
-	struct ftrace_graph_ret trace;
-	unsigned long ret;
-
-	ftrace_pop_return_trace(&trace, &ret, frame_pointer);
-	trace.rettime = trace_clock_local();
-	ftrace_graph_return(&trace);
-	/*
-	 * The ftrace_graph_return() may still access the current
-	 * ret_stack structure, we need to make sure the update of
-	 * curr_ret_stack is after that.
-	 */
-	barrier();
-	current->curr_ret_stack--;
-	/*
-	 * The curr_ret_stack can be less than -1 only if it was
-	 * filtered out and it's about to return from the function.
-	 * Recover the index and continue to trace normal functions.
-	 */
-	if (current->curr_ret_stack < -1) {
-		current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
-		return ret;
-	}
-
-	if (unlikely(!ret)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic. What else to do? */
-		ret = (unsigned long)panic;
-	}
-
-	return ret;
-}
-
 /**
  * ftrace_graph_ret_addr - convert a potentially modified stack return address
  *			   to its original value
-- 
cgit 


From 9cd2992f2d6c8df54c5b937d5d1f8a23b684cc1d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 14 Nov 2018 13:14:58 -0500
Subject: fgraph: Have set_graph_notrace only affect function_graph tracer

In order to make the function graph infrastructure more generic, there can
not be code specific for the function_graph tracer in the generic code. This
includes the set_graph_notrace logic, that stops all graph calls when a
function in the set_graph_notrace is hit.

By using the trace_recursion mask, we can use a bit in the current
task_struct to implement the notrace code, and move the logic out of
fgraph.c and into trace_functions_graph.c and keeps it affecting only the
tracer and not all call graph callbacks.

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c                | 21 ---------------------
 kernel/trace/trace.h                 |  7 +++++++
 kernel/trace/trace_functions_graph.c | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 5ad9c0e88b80..e852b69c0e64 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -64,30 +64,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
 		return -EBUSY;
 	}
 
-	/*
-	 * The curr_ret_stack is an index to ftrace return stack of
-	 * current task.  Its value should be in [0, FTRACE_RETFUNC_
-	 * DEPTH) when the function graph tracer is used.  To support
-	 * filtering out specific functions, it makes the index
-	 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
-	 * so when it sees a negative index the ftrace will ignore
-	 * the record.  And the index gets recovered when returning
-	 * from the filtered function by adding the FTRACE_NOTRACE_
-	 * DEPTH and then it'll continue to record functions normally.
-	 *
-	 * The curr_ret_stack is initialized to -1 and get increased
-	 * in this function.  So it can be less than -1 only if it was
-	 * filtered out via ftrace_graph_notrace_addr() which can be
-	 * set from set_graph_notrace file in tracefs by user.
-	 */
-	if (current->curr_ret_stack < -1)
-		return -EBUSY;
-
 	calltime = trace_clock_local();
 
 	index = ++current->curr_ret_stack;
-	if (ftrace_graph_notrace_addr(func))
-		current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
 	barrier();
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 447bd96ee658..f67060a75f38 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -534,6 +534,13 @@ enum {
 
 	TRACE_GRAPH_DEPTH_START_BIT,
 	TRACE_GRAPH_DEPTH_END_BIT,
+
+	/*
+	 * To implement set_graph_notrace, if this bit is set, we ignore
+	 * function graph tracing of called functions, until the return
+	 * function is called to clear it.
+	 */
+	TRACE_GRAPH_NOTRACE_BIT,
 };
 
 #define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b846d82c2f95..ecf543df943b 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -188,6 +188,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
+	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+		return 0;
+
+	if (ftrace_graph_notrace_addr(trace->func)) {
+		trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+		/*
+		 * Need to return 1 to have the return called
+		 * that will clear the NOTRACE bit.
+		 */
+		return 1;
+	}
+
 	if (!ftrace_trace_task(tr))
 		return 0;
 
@@ -290,6 +302,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 
 	ftrace_graph_addr_finish(trace);
 
+	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
+		trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+		return;
+	}
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -315,6 +332,11 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
 {
 	ftrace_graph_addr_finish(trace);
 
+	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
+		trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+		return;
+	}
+
 	if (tracing_thresh &&
 	    (trace->rettime - trace->calltime < tracing_thresh))
 		return;
-- 
cgit 


From e9ee9efc0d176512cdce9d27ff8549d7ffa2bfcd Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Fri, 30 Nov 2018 21:08:14 -0800
Subject: bpf: Add BPF_F_ANY_ALIGNMENT.

Often we want to write tests cases that check things like bad context
offset accesses.  And one way to do this is to use an odd offset on,
for example, a 32-bit load.

This unfortunately triggers the alignment checks first on platforms
that do not set CONFIG_EFFICIENT_UNALIGNED_ACCESS.  So the test
case see the alignment failure rather than what it was testing for.

It is often not completely possible to respect the original intention
of the test, or even test the same exact thing, while solving the
alignment issue.

Another option could have been to check the alignment after the
context and other validations are performed by the verifier, but
that is a non-trivial change to the verifier.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h                    | 14 ++++++++++++++
 kernel/bpf/syscall.c                        |  7 ++++++-
 kernel/bpf/verifier.c                       |  2 ++
 tools/include/uapi/linux/bpf.h              | 14 ++++++++++++++
 tools/lib/bpf/bpf.c                         |  8 ++++----
 tools/lib/bpf/bpf.h                         |  2 +-
 tools/testing/selftests/bpf/test_align.c    |  4 ++--
 tools/testing/selftests/bpf/test_verifier.c |  3 ++-
 8 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 597afdbc1ab9..8050caea7495 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -232,6 +232,20 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads ands stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85cbeec06e50..f9554d9a14e1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1452,9 +1452,14 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
 		return -EINVAL;
 
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* copy eBPF program license from user space */
 	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
 			      sizeof(license) - 1) < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9584438fa2cc..71988337ac14 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6505,6 +6505,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
+	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+		env->strict_alignment = false;
 
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 597afdbc1ab9..8050caea7495 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -232,6 +232,20 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads ands stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index ce1822194590..c19226cccf39 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -279,9 +279,9 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 }
 
 int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-		       size_t insns_cnt, int strict_alignment,
-		       const char *license, __u32 kern_version,
-		       char *log_buf, size_t log_buf_sz, int log_level)
+		       size_t insns_cnt, __u32 prog_flags, const char *license,
+		       __u32 kern_version, char *log_buf, size_t log_buf_sz,
+		       int log_level)
 {
 	union bpf_attr attr;
 
@@ -295,7 +295,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	attr.log_level = log_level;
 	log_buf[0] = 0;
 	attr.kern_version = kern_version;
-	attr.prog_flags = strict_alignment ? BPF_F_STRICT_ALIGNMENT : 0;
+	attr.prog_flags = prog_flags;
 
 	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 09e8bbe111d4..60392b70587c 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -98,7 +98,7 @@ LIBBPF_API int bpf_load_program(enum bpf_prog_type type,
 				char *log_buf, size_t log_buf_sz);
 LIBBPF_API int bpf_verify_program(enum bpf_prog_type type,
 				  const struct bpf_insn *insns,
-				  size_t insns_cnt, int strict_alignment,
+				  size_t insns_cnt, __u32 prog_flags,
 				  const char *license, __u32 kern_version,
 				  char *log_buf, size_t log_buf_sz,
 				  int log_level);
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 5f377ec53f2f..3c789d03b629 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -620,8 +620,8 @@ static int do_test_single(struct bpf_align_test *test)
 
 	prog_len = probe_filter_length(prog);
 	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-				     prog, prog_len, 1, "GPL", 0,
-				     bpf_vlog, sizeof(bpf_vlog), 2);
+				     prog, prog_len, BPF_F_STRICT_ALIGNMENT,
+				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
 	if (fd_prog < 0 && test->result != REJECT) {
 		printf("Failed to load program.\n");
 		printf("%s", bpf_vlog);
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 5eace1f606fb..78e779c35869 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -14275,7 +14275,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	prog_len = probe_filter_length(prog);
 
 	fd_prog = bpf_verify_program(prog_type, prog, prog_len,
-				     test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
+				     test->flags & F_LOAD_WITH_STRICT_ALIGNMENT ?
+				     BPF_F_STRICT_ALIGNMENT : 0,
 				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1);
 
 	expected_ret = unpriv && test->result_unpriv != UNDEF ?
-- 
cgit 


From b18814e767a445534ab9ccba02e82a31208f85d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 4 Nov 2018 17:27:56 +0100
Subject: dma-direct: provide page based alloc/free helpers

Some architectures support remapping highmem into DMA coherent
allocations.  To use the common code for them we need variants of
dma_direct_{alloc,free}_pages that do not use kernel virtual addresses.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 include/linux/dma-direct.h |  3 +++
 kernel/dma/direct.c        | 32 ++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 9e66bfe369aa..61b78f934f64 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -67,6 +67,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t dma_addr, unsigned long attrs);
+struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
+void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
 dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 22a12ab5a5e9..680287779b0a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -103,14 +103,13 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 			min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
 }
 
-void *dma_direct_alloc_pages(struct device *dev, size_t size,
+struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
 	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	int page_order = get_order(size);
 	struct page *page = NULL;
 	u64 phys_mask;
-	void *ret;
 
 	if (attrs & DMA_ATTR_NO_WARN)
 		gfp |= __GFP_NOWARN;
@@ -150,11 +149,22 @@ again:
 		}
 	}
 
+	return page;
+}
+
+void *dma_direct_alloc_pages(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+	struct page *page;
+	void *ret;
+
+	page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
 	if (!page)
 		return NULL;
+
 	ret = page_address(page);
 	if (force_dma_unencrypted()) {
-		set_memory_decrypted((unsigned long)ret, 1 << page_order);
+		set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
 		*dma_handle = __phys_to_dma(dev, page_to_phys(page));
 	} else {
 		*dma_handle = phys_to_dma(dev, page_to_phys(page));
@@ -163,20 +173,22 @@ again:
 	return ret;
 }
 
-/*
- * NOTE: this function must never look at the dma_addr argument, because we want
- * to be able to use it as a helper for iommu implementations as well.
- */
+void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
+{
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	if (!dma_release_from_contiguous(dev, page, count))
+		__free_pages(page, get_order(size));
+}
+
 void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t dma_addr, unsigned long attrs)
 {
-	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	unsigned int page_order = get_order(size);
 
 	if (force_dma_unencrypted())
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
-	if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count))
-		free_pages((unsigned long)cpu_addr, page_order);
+	__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
 }
 
 void *dma_direct_alloc(struct device *dev, size_t size,
-- 
cgit 


From 704f2c20eaa566f6906e8812b6e2115889bd753d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 22 Sep 2018 20:47:26 +0200
Subject: dma-direct: reject highmem pages from dma_alloc_from_contiguous

dma_alloc_from_contiguous can return highmem pages depending on the
setup, which a plain non-remapping DMA allocator can't handle.  Detect
this case and fail the allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 kernel/dma/direct.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 680287779b0a..c49849bcced6 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -162,6 +162,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
+	if (PageHighMem(page)) {
+		/*
+		 * Depending on the cma= arguments and per-arch setup
+		 * dma_alloc_from_contiguous could return highmem pages.
+		 * Without remapping there is no way to return them here,
+		 * so log an error and fail.
+		 */
+		dev_info(dev, "Rejecting highmem page from CMA.\n");
+		__dma_direct_free_pages(dev, size, page);
+		return NULL;
+	}
+
 	ret = page_address(page);
 	if (force_dma_unencrypted()) {
 		set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
-- 
cgit 


From f0edfea8ef93ed6cc5f747c46c85c8e53e0798a0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Aug 2018 10:31:08 +0200
Subject: dma-mapping: move the remap helpers to a separate file

The dma remap code only makes sense for not cache coherent architectures
(or possibly the corner case of highmem CMA allocations) and currently
is only used by arm, arm64, csky and xtensa.  Split it out into a
separate file with a separate Kconfig symbol, which gets the right
copyright notice given that this code was written by Laura Abbott
working for Code Aurora at that point.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Laura Abbott <labbott@redhat.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm/Kconfig     |  1 +
 arch/arm64/Kconfig   |  1 +
 arch/csky/Kconfig    |  1 +
 arch/xtensa/Kconfig  |  1 +
 kernel/dma/Kconfig   |  4 +++
 kernel/dma/Makefile  |  2 +-
 kernel/dma/mapping.c | 84 -------------------------------------------------
 kernel/dma/remap.c   | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 97 insertions(+), 85 deletions(-)
 create mode 100644 kernel/dma/remap.c

(limited to 'kernel')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 91be74d8df65..3b2852df6eb3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,6 +30,7 @@ config ARM
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select DMA_DIRECT_OPS if !MMU
+	select DMA_REMAP if MMU
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
 	select GENERIC_ALLOCATOR
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 787d7850e064..5d065acb6d10 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -82,6 +82,7 @@ config ARM64
 	select CRC32
 	select DCACHE_WORD_ACCESS
 	select DMA_DIRECT_OPS
+	select DMA_REMAP
 	select EDAC_SUPPORT
 	select FRAME_POINTER
 	select GENERIC_ALLOCATOR
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index cb64f8dacd08..8a30e006a845 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -9,6 +9,7 @@ config CSKY
 	select CLKSRC_OF
 	select DMA_DIRECT_OPS
 	select DMA_NONCOHERENT_OPS
+	select DMA_REMAP
 	select IRQ_DOMAIN
 	select HANDLE_DOMAIN_IRQ
 	select DW_APB_TIMER_OF
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index d29b7365da8d..239bfb16c58b 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -11,6 +11,7 @@ config XTENSA
 	select CLONE_BACKWARDS
 	select COMMON_CLK
 	select DMA_DIRECT_OPS
+	select DMA_REMAP if MMU
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_IRQ_SHOW
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 645c7a2ecde8..c92e08173ed8 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -51,3 +51,7 @@ config SWIOTLB
 	bool
 	select DMA_DIRECT_OPS
 	select NEED_DMA_MAP_STATE
+
+config DMA_REMAP
+	depends on MMU
+	bool
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 7d581e4eea4a..f4feeceb8020 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_DMA_DIRECT_OPS)		+= direct.o
 obj-$(CONFIG_DMA_VIRT_OPS)		+= virt.o
 obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
-
+obj-$(CONFIG_DMA_REMAP)			+= remap.o
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58dec7a92b7b..dfbc3deb95cd 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -262,87 +262,3 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
 #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
 }
 EXPORT_SYMBOL(dma_common_mmap);
-
-#ifdef CONFIG_MMU
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
-			size_t size, unsigned long vm_flags, pgprot_t prot,
-			const void *caller)
-{
-	struct vm_struct *area;
-
-	area = get_vm_area_caller(size, vm_flags, caller);
-	if (!area)
-		return NULL;
-
-	if (map_vm_area(area, prot, pages)) {
-		vunmap(area->addr);
-		return NULL;
-	}
-
-	return area;
-}
-
-/*
- * remaps an array of PAGE_SIZE pages into another vm_area
- * Cannot be used in non-sleeping contexts
- */
-void *dma_common_pages_remap(struct page **pages, size_t size,
-			unsigned long vm_flags, pgprot_t prot,
-			const void *caller)
-{
-	struct vm_struct *area;
-
-	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
-	if (!area)
-		return NULL;
-
-	area->pages = pages;
-
-	return area->addr;
-}
-
-/*
- * remaps an allocated contiguous region into another vm_area.
- * Cannot be used in non-sleeping contexts
- */
-
-void *dma_common_contiguous_remap(struct page *page, size_t size,
-			unsigned long vm_flags,
-			pgprot_t prot, const void *caller)
-{
-	int i;
-	struct page **pages;
-	struct vm_struct *area;
-
-	pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
-	if (!pages)
-		return NULL;
-
-	for (i = 0; i < (size >> PAGE_SHIFT); i++)
-		pages[i] = nth_page(page, i);
-
-	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
-
-	kfree(pages);
-
-	if (!area)
-		return NULL;
-	return area->addr;
-}
-
-/*
- * unmaps a range previously mapped by dma_common_*_remap
- */
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
-{
-	struct vm_struct *area = find_vm_area(cpu_addr);
-
-	if (!area || (area->flags & vm_flags) != vm_flags) {
-		WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
-		return;
-	}
-
-	unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
-	vunmap(cpu_addr);
-}
-#endif
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
new file mode 100644
index 000000000000..a15c393ea4e5
--- /dev/null
+++ b/kernel/dma/remap.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 The Linux Foundation
+ */
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+static struct vm_struct *__dma_common_pages_remap(struct page **pages,
+			size_t size, unsigned long vm_flags, pgprot_t prot,
+			const void *caller)
+{
+	struct vm_struct *area;
+
+	area = get_vm_area_caller(size, vm_flags, caller);
+	if (!area)
+		return NULL;
+
+	if (map_vm_area(area, prot, pages)) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	return area;
+}
+
+/*
+ * Remaps an array of PAGE_SIZE pages into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+			unsigned long vm_flags, pgprot_t prot,
+			const void *caller)
+{
+	struct vm_struct *area;
+
+	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+	if (!area)
+		return NULL;
+
+	area->pages = pages;
+
+	return area->addr;
+}
+
+/*
+ * Remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+			unsigned long vm_flags,
+			pgprot_t prot, const void *caller)
+{
+	int i;
+	struct page **pages;
+	struct vm_struct *area;
+
+	pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+	if (!pages)
+		return NULL;
+
+	for (i = 0; i < (size >> PAGE_SHIFT); i++)
+		pages[i] = nth_page(page, i);
+
+	area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+
+	kfree(pages);
+
+	if (!area)
+		return NULL;
+	return area->addr;
+}
+
+/*
+ * Unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+{
+	struct vm_struct *area = find_vm_area(cpu_addr);
+
+	if (!area || (area->flags & vm_flags) != vm_flags) {
+		WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+		return;
+	}
+
+	unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
+	vunmap(cpu_addr);
+}
-- 
cgit 


From 0c3b3171ceccb8830c2bb5adff1b4e9b204c1450 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 4 Nov 2018 20:29:28 +0100
Subject: dma-mapping: move the arm64 noncoherent alloc/free support to common
 code

The arm64 codebase to implement coherent dma allocation for architectures
with non-coherent DMA is a good start for a generic implementation, given
that is uses the generic remap helpers, provides the atomic pool for
allocations that can't sleep and still is realtively simple and well
tested.  Move it to kernel/dma and allow architectures to opt into it
using a config symbol.  Architectures just need to provide a new
arch_dma_prep_coherent helper to writeback an invalidate the caches
for any memory that gets remapped for uncached access.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/Kconfig              |   2 +-
 arch/arm64/mm/dma-mapping.c     | 184 +++-------------------------------------
 include/linux/dma-mapping.h     |   5 ++
 include/linux/dma-noncoherent.h |   2 +
 kernel/dma/Kconfig              |   5 ++
 kernel/dma/remap.c              | 158 +++++++++++++++++++++++++++++++++-
 6 files changed, 180 insertions(+), 176 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5d065acb6d10..2e645ea693ea 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -82,7 +82,7 @@ config ARM64
 	select CRC32
 	select DCACHE_WORD_ACCESS
 	select DMA_DIRECT_OPS
-	select DMA_REMAP
+	select DMA_DIRECT_REMAP
 	select EDAC_SUPPORT
 	select FRAME_POINTER
 	select GENERIC_ALLOCATOR
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index a3ac26284845..e2e7e5d0f94e 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -33,113 +33,6 @@
 
 #include <asm/cacheflush.h>
 
-static struct gen_pool *atomic_pool __ro_after_init;
-
-#define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
-static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
-
-static int __init early_coherent_pool(char *p)
-{
-	atomic_pool_size = memparse(p, &p);
-	return 0;
-}
-early_param("coherent_pool", early_coherent_pool);
-
-static void *__alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
-{
-	unsigned long val;
-	void *ptr = NULL;
-
-	if (!atomic_pool) {
-		WARN(1, "coherent pool not initialised!\n");
-		return NULL;
-	}
-
-	val = gen_pool_alloc(atomic_pool, size);
-	if (val) {
-		phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
-
-		*ret_page = phys_to_page(phys);
-		ptr = (void *)val;
-		memset(ptr, 0, size);
-	}
-
-	return ptr;
-}
-
-static bool __in_atomic_pool(void *start, size_t size)
-{
-	return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
-}
-
-static int __free_from_pool(void *start, size_t size)
-{
-	if (!__in_atomic_pool(start, size))
-		return 0;
-
-	gen_pool_free(atomic_pool, (unsigned long)start, size);
-
-	return 1;
-}
-
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t flags, unsigned long attrs)
-{
-	struct page *page;
-	void *ptr, *coherent_ptr;
-	pgprot_t prot = pgprot_writecombine(PAGE_KERNEL);
-
-	size = PAGE_ALIGN(size);
-
-	if (!gfpflags_allow_blocking(flags)) {
-		struct page *page = NULL;
-		void *addr = __alloc_from_pool(size, &page, flags);
-
-		if (addr)
-			*dma_handle = phys_to_dma(dev, page_to_phys(page));
-
-		return addr;
-	}
-
-	ptr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
-	if (!ptr)
-		goto no_mem;
-
-	/* remove any dirty cache lines on the kernel alias */
-	__dma_flush_area(ptr, size);
-
-	/* create a coherent mapping */
-	page = virt_to_page(ptr);
-	coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
-						   prot, __builtin_return_address(0));
-	if (!coherent_ptr)
-		goto no_map;
-
-	return coherent_ptr;
-
-no_map:
-	dma_direct_free_pages(dev, size, ptr, *dma_handle, attrs);
-no_mem:
-	return NULL;
-}
-
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, unsigned long attrs)
-{
-	if (!__free_from_pool(vaddr, PAGE_ALIGN(size))) {
-		void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle));
-
-		vunmap(vaddr);
-		dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs);
-	}
-}
-
-long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
-		dma_addr_t dma_addr)
-{
-	return __phys_to_pfn(dma_to_phys(dev, dma_addr));
-}
-
 pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot,
 		unsigned long attrs)
 {
@@ -160,6 +53,11 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 	__dma_unmap_area(phys_to_virt(paddr), size, dir);
 }
 
+void arch_dma_prep_coherent(struct page *page, size_t size)
+{
+	__dma_flush_area(page_address(page), size);
+}
+
 #ifdef CONFIG_IOMMU_DMA
 static int __swiotlb_get_sgtable_page(struct sg_table *sgt,
 				      struct page *page, size_t size)
@@ -191,67 +89,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_IOMMU_DMA */
 
-static int __init atomic_pool_init(void)
-{
-	pgprot_t prot = __pgprot(PROT_NORMAL_NC);
-	unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
-	struct page *page;
-	void *addr;
-	unsigned int pool_size_order = get_order(atomic_pool_size);
-
-	if (dev_get_cma_area(NULL))
-		page = dma_alloc_from_contiguous(NULL, nr_pages,
-						 pool_size_order, false);
-	else
-		page = alloc_pages(GFP_DMA32, pool_size_order);
-
-	if (page) {
-		int ret;
-		void *page_addr = page_address(page);
-
-		memset(page_addr, 0, atomic_pool_size);
-		__dma_flush_area(page_addr, atomic_pool_size);
-
-		atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
-		if (!atomic_pool)
-			goto free_page;
-
-		addr = dma_common_contiguous_remap(page, atomic_pool_size,
-					VM_USERMAP, prot, atomic_pool_init);
-
-		if (!addr)
-			goto destroy_genpool;
-
-		ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
-					page_to_phys(page),
-					atomic_pool_size, -1);
-		if (ret)
-			goto remove_mapping;
-
-		gen_pool_set_algo(atomic_pool,
-				  gen_pool_first_fit_order_align,
-				  NULL);
-
-		pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
-			atomic_pool_size / 1024);
-		return 0;
-	}
-	goto out;
-
-remove_mapping:
-	dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
-destroy_genpool:
-	gen_pool_destroy(atomic_pool);
-	atomic_pool = NULL;
-free_page:
-	if (!dma_release_from_contiguous(NULL, page, nr_pages))
-		__free_pages(page, pool_size_order);
-out:
-	pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
-		atomic_pool_size / 1024);
-	return -ENOMEM;
-}
-
 /********************************************
  * The following APIs are for dummy DMA ops *
  ********************************************/
@@ -350,8 +187,7 @@ static int __init arm64_dma_init(void)
 		   TAINT_CPU_OUT_OF_SPEC,
 		   "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
 		   ARCH_DMA_MINALIGN, cache_line_size());
-
-	return atomic_pool_init();
+	return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC));
 }
 arch_initcall(arm64_dma_init);
 
@@ -397,7 +233,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 			page = alloc_pages(gfp, get_order(size));
 			addr = page ? page_address(page) : NULL;
 		} else {
-			addr = __alloc_from_pool(size, &page, gfp);
+			addr = dma_alloc_from_pool(size, &page, gfp);
 		}
 		if (!addr)
 			return NULL;
@@ -407,7 +243,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 			if (coherent)
 				__free_pages(page, get_order(size));
 			else
-				__free_from_pool(addr, size);
+				dma_free_from_pool(addr, size);
 			addr = NULL;
 		}
 	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
@@ -471,9 +307,9 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 	 *   coherent devices.
 	 * Hence how dodgy the below logic looks...
 	 */
-	if (__in_atomic_pool(cpu_addr, size)) {
+	if (dma_in_atomic_pool(cpu_addr, size)) {
 		iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
-		__free_from_pool(cpu_addr, size);
+		dma_free_from_pool(cpu_addr, size);
 	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
 		struct page *page = vmalloc_to_page(cpu_addr);
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0f81c713f6e9..1a0edcde7d14 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -455,6 +455,11 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
 			const void *caller);
 void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags);
 
+int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot);
+bool dma_in_atomic_pool(void *start, size_t size);
+void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags);
+bool dma_free_from_pool(void *start, size_t size);
+
 /**
  * dma_mmap_attrs - map a coherent DMA allocation into user space
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 9051b055beec..306557331d7d 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -69,4 +69,6 @@ static inline void arch_sync_dma_for_cpu_all(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */
 
+void arch_dma_prep_coherent(struct page *page, size_t size);
+
 #endif /* _LINUX_DMA_NONCOHERENT_H */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c92e08173ed8..41c3b1df70eb 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -55,3 +55,8 @@ config SWIOTLB
 config DMA_REMAP
 	depends on MMU
 	bool
+
+config DMA_DIRECT_REMAP
+	bool
+	depends on DMA_DIRECT_OPS
+	select DMA_REMAP
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index a15c393ea4e5..b32bb08f96ae 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (C) 2012 ARM Ltd.
  * Copyright (c) 2014 The Linux Foundation
  */
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/dma-contiguous.h>
+#include <linux/init.h>
+#include <linux/genalloc.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 
@@ -86,3 +91,154 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
 	unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
 	vunmap(cpu_addr);
 }
+
+#ifdef CONFIG_DMA_DIRECT_REMAP
+static struct gen_pool *atomic_pool __ro_after_init;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+	atomic_pool_size = memparse(p, &p);
+	return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+{
+	unsigned int pool_size_order = get_order(atomic_pool_size);
+	unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+	struct page *page;
+	void *addr;
+	int ret;
+
+	if (dev_get_cma_area(NULL))
+		page = dma_alloc_from_contiguous(NULL, nr_pages,
+						 pool_size_order, false);
+	else
+		page = alloc_pages(gfp, pool_size_order);
+	if (!page)
+		goto out;
+
+	memset(page_address(page), 0, atomic_pool_size);
+	arch_dma_prep_coherent(page, atomic_pool_size);
+
+	atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+	if (!atomic_pool)
+		goto free_page;
+
+	addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
+					   prot, __builtin_return_address(0));
+	if (!addr)
+		goto destroy_genpool;
+
+	ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+				page_to_phys(page), atomic_pool_size, -1);
+	if (ret)
+		goto remove_mapping;
+	gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL);
+
+	pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+		atomic_pool_size / 1024);
+	return 0;
+
+remove_mapping:
+	dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+destroy_genpool:
+	gen_pool_destroy(atomic_pool);
+	atomic_pool = NULL;
+free_page:
+	if (!dma_release_from_contiguous(NULL, page, nr_pages))
+		__free_pages(page, pool_size_order);
+out:
+	pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+		atomic_pool_size / 1024);
+	return -ENOMEM;
+}
+
+bool dma_in_atomic_pool(void *start, size_t size)
+{
+	return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
+{
+	unsigned long val;
+	void *ptr = NULL;
+
+	if (!atomic_pool) {
+		WARN(1, "coherent pool not initialised!\n");
+		return NULL;
+	}
+
+	val = gen_pool_alloc(atomic_pool, size);
+	if (val) {
+		phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+		*ret_page = pfn_to_page(__phys_to_pfn(phys));
+		ptr = (void *)val;
+		memset(ptr, 0, size);
+	}
+
+	return ptr;
+}
+
+bool dma_free_from_pool(void *start, size_t size)
+{
+	if (!dma_in_atomic_pool(start, size))
+		return false;
+	gen_pool_free(atomic_pool, (unsigned long)start, size);
+	return true;
+}
+
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t flags, unsigned long attrs)
+{
+	struct page *page = NULL;
+	void *ret, *kaddr;
+
+	size = PAGE_ALIGN(size);
+
+	if (!gfpflags_allow_blocking(flags)) {
+		ret = dma_alloc_from_pool(size, &page, flags);
+		if (!ret)
+			return NULL;
+		*dma_handle = phys_to_dma(dev, page_to_phys(page));
+		return ret;
+	}
+
+	kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
+	if (!kaddr)
+		return NULL;
+	page = virt_to_page(kaddr);
+
+	/* remove any dirty cache lines on the kernel alias */
+	arch_dma_prep_coherent(page, size);
+
+	/* create a coherent mapping */
+	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
+			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
+			__builtin_return_address(0));
+	if (!ret)
+		dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs);
+	return ret;
+}
+
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
+		dma_addr_t dma_handle, unsigned long attrs)
+{
+	if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+		void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle));
+
+		vunmap(vaddr);
+		dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs);
+	}
+}
+
+long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
+		dma_addr_t dma_addr)
+{
+	return __phys_to_pfn(dma_to_phys(dev, dma_addr));
+}
+#endif /* CONFIG_DMA_DIRECT_REMAP */
-- 
cgit 


From bfd56cd605219d90b210a5377fca31a644efe95c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 4 Nov 2018 17:38:39 +0100
Subject: dma-mapping: support highmem in the generic remap allocator

By using __dma_direct_alloc_pages we can deal entirely with struct page
instead of having to derive a kernel virtual address.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 kernel/dma/remap.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index b32bb08f96ae..dcc82dd668f8 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -196,7 +196,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t flags, unsigned long attrs)
 {
 	struct page *page = NULL;
-	void *ret, *kaddr;
+	void *ret;
 
 	size = PAGE_ALIGN(size);
 
@@ -208,10 +208,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		return ret;
 	}
 
-	kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
-	if (!kaddr)
+	page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
+	if (!page)
 		return NULL;
-	page = virt_to_page(kaddr);
 
 	/* remove any dirty cache lines on the kernel alias */
 	arch_dma_prep_coherent(page, size);
@@ -221,7 +220,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
 			__builtin_return_address(0));
 	if (!ret)
-		dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs);
+		__dma_direct_free_pages(dev, size, page);
 	return ret;
 }
 
@@ -229,10 +228,11 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
-		void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle));
+		phys_addr_t phys = dma_to_phys(dev, dma_handle);
+		struct page *page = pfn_to_page(__phys_to_pfn(phys));
 
 		vunmap(vaddr);
-		dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs);
+		__dma_direct_free_pages(dev, size, page);
 	}
 }
 
-- 
cgit 


From e440e26a0251b054243b3db6356f3869a9f9c2d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 4 Nov 2018 17:51:50 +0100
Subject: dma-remap: support DMA_ATTR_NO_KERNEL_MAPPING

Do not waste vmalloc space on allocations that do not require a mapping
into the kernel address space.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 kernel/dma/remap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index dcc82dd668f8..68a64e3ff6a1 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -200,7 +200,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 	size = PAGE_ALIGN(size);
 
-	if (!gfpflags_allow_blocking(flags)) {
+	if (!gfpflags_allow_blocking(flags) &&
+	    !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
 		ret = dma_alloc_from_pool(size, &page, flags);
 		if (!ret)
 			return NULL;
@@ -215,6 +216,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* remove any dirty cache lines on the kernel alias */
 	arch_dma_prep_coherent(page, size);
 
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING)
+		return page; /* opaque cookie */
+
 	/* create a coherent mapping */
 	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
 			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@@ -227,7 +231,10 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 void arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
-	if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+		/* vaddr is a struct page cookie, not a kernel address */
+		__dma_direct_free_pages(dev, size, vaddr);
+	} else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
 		phys_addr_t phys = dma_to_phys(dev, dma_handle);
 		struct page *page = pfn_to_page(__phys_to_pfn(phys));
 
-- 
cgit 


From 2af3024cd78f120d027cb44b454186ba9d7dab24 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 7 Nov 2018 14:11:40 -0800
Subject: cgroups: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change,
even though it is but a comment.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Dennis Zhou (Facebook)" <dennisszhou@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..7a8429f8e280 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5343,7 +5343,7 @@ int __init cgroup_init(void)
 	cgroup_rstat_boot();
 
 	/*
-	 * The latency of the synchronize_sched() is too high for cgroups,
+	 * The latency of the synchronize_rcu() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
 	 */
 	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-- 
cgit 


From 6932689e4145f545062ca8c86cf76f38854d63d0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 7 Nov 2018 14:16:57 -0800
Subject: livepatch: Replace synchronize_sched() with synchronize_rcu()

Now that synchronize_rcu() waits for preempt-disable regions of code
as well as RCU read-side critical sections, synchronize_sched() can be
replaced by synchronize_rcu().  This commit therefore makes this change,
even though it is but a comment.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/livepatch/patch.c      | 4 ++--
 kernel/livepatch/transition.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 82d584225dc6..7702cb4064fc 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -61,7 +61,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
 	ops = container_of(fops, struct klp_ops, fops);
 
 	/*
-	 * A variant of synchronize_sched() is used to allow patching functions
+	 * A variant of synchronize_rcu() is used to allow patching functions
 	 * where RCU is not watching, see klp_synchronize_transition().
 	 */
 	preempt_disable_notrace();
@@ -72,7 +72,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
 	/*
 	 * func should never be NULL because preemption should be disabled here
 	 * and unregister_ftrace_function() does the equivalent of a
-	 * synchronize_sched() before the func_stack removal.
+	 * synchronize_rcu() before the func_stack removal.
 	 */
 	if (WARN_ON_ONCE(!func))
 		goto unlock;
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5bc349805e03..304d5eb8a98c 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -52,7 +52,7 @@ static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
 
 /*
  * This function is just a stub to implement a hard force
- * of synchronize_sched(). This requires synchronizing
+ * of synchronize_rcu(). This requires synchronizing
  * tasks even in userspace and idle.
  */
 static void klp_sync(struct work_struct *work)
@@ -175,7 +175,7 @@ void klp_cancel_transition(void)
 void klp_update_patch_state(struct task_struct *task)
 {
 	/*
-	 * A variant of synchronize_sched() is used to allow patching functions
+	 * A variant of synchronize_rcu() is used to allow patching functions
 	 * where RCU is not watching, see klp_synchronize_transition().
 	 */
 	preempt_disable_notrace();
-- 
cgit 


From 4871848531af1d62f30032bfb872c43b9afe03ad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 15 Aug 2018 15:32:51 -0700
Subject: rcutorture: Add call_rcu() flooding forward-progress tests

This commit adds a call_rcu() flooding loop to the forward-progress test.
This emulates tight userspace loops that force call_rcu() invocations,
for example, the infamous loop containing close(open()) that instigated
the addition of blimit.  If RCU does not make sufficient forward progress
in invoking the resulting flood of callbacks, rcutorture emits a warning.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 210c77460365..8cf700ca7845 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -259,6 +259,8 @@ static atomic_t barrier_cbs_invoked;	/* Barrier callbacks invoked. */
 static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
 static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
 
+static bool rcu_fwd_cb_nodelay;		/* Short rcu_torture_delay() delays. */
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -348,7 +350,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
 	 * period, and we want a long delay occasionally to trigger
 	 * force_quiescent_state. */
 
-	if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
+	if (!rcu_fwd_cb_nodelay &&
+	    !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
 		started = cur_ops->get_gp_seq();
 		ts = rcu_trace_clock_local();
 		if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK))
@@ -1674,6 +1677,43 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
 	cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb);
 }
 
+/* State for continuous-flood RCU callbacks. */
+struct rcu_fwd_cb {
+	struct rcu_head rh;
+	struct rcu_fwd_cb *rfc_next;
+	int rfc_gps;
+};
+static DEFINE_SPINLOCK(rcu_fwd_lock);
+static struct rcu_fwd_cb *rcu_fwd_cb_head;
+static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+static long n_launders_cb;
+static unsigned long rcu_fwd_startat;
+#define MAX_FWD_CB_JIFFIES	(8 * HZ) /* Maximum CB test duration. */
+#define MIN_FWD_CB_LAUNDERS	3	/* This many CB invocations to count. */
+#define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */
+static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ];
+
+/* Callback function for continuous-flood RCU callbacks. */
+static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
+{
+	int i;
+	struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
+	struct rcu_fwd_cb **rfcpp;
+
+	rfcp->rfc_next = NULL;
+	rfcp->rfc_gps++;
+	spin_lock(&rcu_fwd_lock);
+	rfcpp = rcu_fwd_cb_tail;
+	rcu_fwd_cb_tail = &rfcp->rfc_next;
+	WRITE_ONCE(*rfcpp, rfcp);
+	WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
+	i = ((jiffies - rcu_fwd_startat) / HZ);
+	if (i >= ARRAY_SIZE(n_launders_hist))
+		i = ARRAY_SIZE(n_launders_hist) - 1;
+	n_launders_hist[i]++;
+	spin_unlock(&rcu_fwd_lock);
+}
+
 /* Carry out grace-period forward-progress testing. */
 static int rcu_torture_fwd_prog(void *args)
 {
@@ -1681,11 +1721,21 @@ static int rcu_torture_fwd_prog(void *args)
 	unsigned long dur;
 	struct fwd_cb_state fcs;
 	unsigned long gps;
+	int i;
 	int idx;
+	int j;
+	long n_launders;
+	long n_launders_cb_snap;
+	long n_launders_sa;
+	long n_max_cbs;
+	long n_max_gps;
+	struct rcu_fwd_cb *rfcp;
+	struct rcu_fwd_cb *rfcpn;
 	int sd;
 	int sd4;
 	bool selfpropcb = false;
 	unsigned long stopat;
+	unsigned long stoppedat;
 	int tested = 0;
 	int tested_tries = 0;
 	static DEFINE_TORTURE_RANDOM(trs);
@@ -1699,6 +1749,8 @@ static int rcu_torture_fwd_prog(void *args)
 	}
 	do {
 		schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+
+		/* Tight loop containing cond_resched(). */
 		if  (selfpropcb) {
 			WRITE_ONCE(fcs.stop, 0);
 			cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
@@ -1708,7 +1760,8 @@ static int rcu_torture_fwd_prog(void *args)
 		sd = cur_ops->stall_dur() + 1;
 		sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
 		dur = sd4 + torture_random(&trs) % (sd - sd4);
-		stopat = jiffies + dur;
+		rcu_fwd_startat = jiffies;
+		stopat = rcu_fwd_startat + dur;
 		while (time_before(jiffies, stopat) && !torture_must_stop()) {
 			idx = cur_ops->readlock();
 			udelay(10);
@@ -1729,6 +1782,78 @@ static int rcu_torture_fwd_prog(void *args)
 			cur_ops->sync(); /* Wait for running CB to complete. */
 			cur_ops->cb_barrier(); /* Wait for queued callbacks. */
 		}
+
+		/* Loop continuously posting RCU callbacks. */
+		WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+		cur_ops->sync(); /* Later readers see above write. */
+		rcu_fwd_startat = jiffies;
+		stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+		n_launders = 0;
+		n_launders_cb = 0;
+		n_launders_sa = 0;
+		n_max_cbs = 0;
+		n_max_gps = 0;
+		for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
+			n_launders_hist[i] = 0;
+		cver = READ_ONCE(rcu_torture_current_version);
+		gps = cur_ops->get_gp_seq();
+		while (time_before(jiffies, stopat) && !torture_must_stop()) {
+			rfcp = READ_ONCE(rcu_fwd_cb_head);
+			rfcpn = NULL;
+			if (rfcp)
+				rfcpn = READ_ONCE(rfcp->rfc_next);
+			if (rfcpn) {
+				if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
+				    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
+					break;
+				rcu_fwd_cb_head = rfcpn;
+				n_launders++;
+				n_launders_sa++;
+			} else {
+				rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
+				if (WARN_ON_ONCE(!rfcp)) {
+					schedule_timeout_interruptible(1);
+					continue;
+				}
+				n_max_cbs++;
+				n_launders_sa = 0;
+				rfcp->rfc_gps = 0;
+			}
+			cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+			cond_resched();
+		}
+		stoppedat = jiffies;
+		n_launders_cb_snap = READ_ONCE(n_launders_cb);
+		cver = READ_ONCE(rcu_torture_current_version) - cver;
+		gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+		cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
+		for (;;) {
+			rfcp = rcu_fwd_cb_head;
+			if (!rfcp)
+				break;
+			rcu_fwd_cb_head = rfcp->rfc_next;
+			kfree(rfcp);
+		}
+		rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+		WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+		if (!torture_must_stop()) {
+			WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
+			pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+				 __func__,
+				 stoppedat - rcu_fwd_startat,
+				 jiffies - stoppedat,
+				 n_launders + n_max_cbs - n_launders_cb_snap,
+				 n_launders, n_launders_sa,
+				 n_max_gps, n_max_cbs, cver, gps);
+			for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
+				if (n_launders_hist[i] > 0)
+					break;
+			pr_alert("Callback-invocation histogram:");
+			for (j = 0; j <= i; j++)
+				pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
+			pr_cont("\n");
+		}
+
 		/* Avoid slow periods, better to test when busy. */
 		stutter_wait("rcu_torture_fwd_prog");
 	} while (!torture_must_stop());
-- 
cgit 


From 28cf5952f56005325f269ccfe402a880cd741189 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2018 15:27:16 -0700
Subject: torture: Bring any extra CPUs online during kernel startup

Currently, the torture scripts rely on the initrd/init script to bring
any extra CPUs online, for example, in the case where the kernel and
qemu have different ideas about how many CPUs are present.  This works,
but is an unnecessary dependency on initrd, which needs to vary depending
on the distro.  This commit therefore causes torture_onoff() to check
for additional CPUs, attempting to bring any found online. Errors are
ignored, just as they are by the initrd/init script.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/torture.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index 17d91f5fba2a..9410d1bf84d6 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -194,11 +194,23 @@ torture_onoff(void *arg)
 	int cpu;
 	int maxcpu = -1;
 	DEFINE_TORTURE_RANDOM(rand);
+	int ret;
 
 	VERBOSE_TOROUT_STRING("torture_onoff task started");
 	for_each_online_cpu(cpu)
 		maxcpu = cpu;
 	WARN_ON(maxcpu < 0);
+	if (!IS_MODULE(CONFIG_TORTURE_TEST))
+		for_each_possible_cpu(cpu) {
+			if (cpu_online(cpu))
+				continue;
+			ret = cpu_up(cpu);
+			if (ret && verbose) {
+				pr_alert("%s" TORTURE_FLAG
+					 "%s: Initial online %d: errno %d\n",
+					 __func__, torture_type, cpu, ret);
+			}
+		}
 
 	if (maxcpu == 0) {
 		VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
-- 
cgit 


From fc6f9c57787e578473d47b7bbc846e317d17c1df Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 27 Aug 2018 14:43:05 -0700
Subject: rcutorture: Remove cbflood facility

Now that the forward-progress code does a full-bore continuous callback
flood lasting multiple seconds, there is little point in also posting a
mere 60,000 callbacks every second or so.  This commit therefore removes
the old cbflood testing.  Over time, it may be desirable to concurrently
do full-bore continuous callback floods on all CPUs simultaneously, but
one dragon at a time.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 18 ------
 kernel/rcu/rcutorture.c                         | 86 +------------------------
 2 files changed, 1 insertion(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3823679deea5..86e825e0927a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3743,24 +3743,6 @@
 			in microseconds.  The default of zero says
 			no holdoff.
 
-	rcutorture.cbflood_inter_holdoff= [KNL]
-			Set holdoff time (jiffies) between successive
-			callback-flood tests.
-
-	rcutorture.cbflood_intra_holdoff= [KNL]
-			Set holdoff time (jiffies) between successive
-			bursts of callbacks within a given callback-flood
-			test.
-
-	rcutorture.cbflood_n_burst= [KNL]
-			Set the number of bursts making up a given
-			callback-flood test.  Set this to zero to
-			disable callback-flood testing.
-
-	rcutorture.cbflood_n_per_burst= [KNL]
-			Set the number of callbacks to be registered
-			in a given burst of a callback-flood test.
-
 	rcutorture.fqs_duration= [KNL]
 			Set duration of force_quiescent_state bursts
 			in microseconds.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8cf700ca7845..17f480129a78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -80,13 +80,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@jos
 					/* Must be power of two minus one. */
 #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
 
-torture_param(int, cbflood_inter_holdoff, HZ,
-	      "Holdoff between floods (jiffies)");
-torture_param(int, cbflood_intra_holdoff, 1,
-	      "Holdoff between bursts (jiffies)");
-torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
-torture_param(int, cbflood_n_per_burst, 20000,
-	      "# callbacks per burst in flood");
 torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
 	      "Extend readers by disabling bh (1), irqs (2), or preempt (4)");
 torture_param(int, fqs_duration, 0,
@@ -138,12 +131,10 @@ module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
 
 static int nrealreaders;
-static int ncbflooders;
 static struct task_struct *writer_task;
 static struct task_struct **fakewriter_tasks;
 static struct task_struct **reader_tasks;
 static struct task_struct *stats_task;
-static struct task_struct **cbflood_task;
 static struct task_struct *fqs_task;
 static struct task_struct *boost_tasks[NR_CPUS];
 static struct task_struct *stall_task;
@@ -181,7 +172,6 @@ static long n_rcu_torture_boosts;
 static atomic_long_t n_rcu_torture_timers;
 static long n_barrier_attempts;
 static long n_barrier_successes; /* did rcu_barrier test succeed? */
-static atomic_long_t n_cbfloods;
 static struct list_head rcu_torture_removed;
 
 static int rcu_torture_writer_state;
@@ -873,59 +863,6 @@ checkwait:	stutter_wait("rcu_torture_boost");
 	return 0;
 }
 
-static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
-{
-}
-
-/*
- * RCU torture callback-flood kthread.  Repeatedly induces bursts of calls
- * to call_rcu() or analogous, increasing the probability of occurrence
- * of callback-overflow corner cases.
- */
-static int
-rcu_torture_cbflood(void *arg)
-{
-	int err = 1;
-	int i;
-	int j;
-	struct rcu_head *rhp;
-
-	if (cbflood_n_per_burst > 0 &&
-	    cbflood_inter_holdoff > 0 &&
-	    cbflood_intra_holdoff > 0 &&
-	    cur_ops->call &&
-	    cur_ops->cb_barrier) {
-		rhp = vmalloc(array3_size(cbflood_n_burst,
-					  cbflood_n_per_burst,
-					  sizeof(*rhp)));
-		err = !rhp;
-	}
-	if (err) {
-		VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
-		goto wait_for_stop;
-	}
-	VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
-	do {
-		schedule_timeout_interruptible(cbflood_inter_holdoff);
-		atomic_long_inc(&n_cbfloods);
-		WARN_ON(signal_pending(current));
-		for (i = 0; i < cbflood_n_burst; i++) {
-			for (j = 0; j < cbflood_n_per_burst; j++) {
-				cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
-					      rcu_torture_cbflood_cb);
-			}
-			schedule_timeout_interruptible(cbflood_intra_holdoff);
-			WARN_ON(signal_pending(current));
-		}
-		cur_ops->cb_barrier();
-		stutter_wait("rcu_torture_cbflood");
-	} while (!torture_must_stop());
-	vfree(rhp);
-wait_for_stop:
-	torture_kthread_stopping("rcu_torture_cbflood");
-	return 0;
-}
-
 /*
  * RCU torture force-quiescent-state kthread.  Repeatedly induces
  * bursts of calls to force_quiescent_state(), increasing the probability
@@ -1460,11 +1397,10 @@ rcu_torture_stats_print(void)
 		n_rcu_torture_boosts,
 		atomic_long_read(&n_rcu_torture_timers));
 	torture_onoff_stats();
-	pr_cont("barrier: %ld/%ld:%ld ",
+	pr_cont("barrier: %ld/%ld:%ld\n",
 		n_barrier_successes,
 		n_barrier_attempts,
 		n_rcu_torture_barrier_error);
-	pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
 
 	pr_alert("%s%s ", torture_type, TORTURE_FLAG);
 	if (atomic_read(&n_rcu_torture_mberror) != 0 ||
@@ -2093,8 +2029,6 @@ rcu_torture_cleanup(void)
 		 cur_ops->name, gp_seq, flags);
 	torture_stop_kthread(rcu_torture_stats, stats_task);
 	torture_stop_kthread(rcu_torture_fqs, fqs_task);
-	for (i = 0; i < ncbflooders; i++)
-		torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
 	if (rcu_torture_can_boost())
 		cpuhp_remove_state(rcutor_hp);
 
@@ -2377,24 +2311,6 @@ rcu_torture_init(void)
 		goto unwind;
 	if (object_debug)
 		rcu_test_debug_objects();
-	if (cbflood_n_burst > 0) {
-		/* Create the cbflood threads */
-		ncbflooders = (num_online_cpus() + 3) / 4;
-		cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
-				       GFP_KERNEL);
-		if (!cbflood_task) {
-			VERBOSE_TOROUT_ERRSTRING("out of memory");
-			firsterr = -ENOMEM;
-			goto unwind;
-		}
-		for (i = 0; i < ncbflooders; i++) {
-			firsterr = torture_create_kthread(rcu_torture_cbflood,
-							  NULL,
-							  cbflood_task[i]);
-			if (firsterr)
-				goto unwind;
-		}
-	}
 	torture_init_end();
 	return 0;
 
-- 
cgit 


From 6b3de7a172bc59010a9d8e425877d98c1f24555e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 28 Aug 2018 14:38:43 -0700
Subject: rcutorture: Break up too-long rcu_torture_fwd_prog() function

This commit splits rcu_torture_fwd_prog_nr() and rcu_torture_fwd_prog_cr()
functions out of rcu_torture_fwd_prog() in order to reduce indentation
pain and because rcu_torture_fwd_prog() was getting a bit too long.
In addition, this will enable easier conditional execution of the
rcu_torture_fwd_prog_cr() function, which can give false-positive
failures in some NO_HZ_FULL configurations due to overloading the
housekeeping CPUs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 254 +++++++++++++++++++++++++-----------------------
 1 file changed, 135 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 17f480129a78..bcc33bb8d9a6 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1650,15 +1650,70 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 	spin_unlock(&rcu_fwd_lock);
 }
 
-/* Carry out grace-period forward-progress testing. */
-static int rcu_torture_fwd_prog(void *args)
+/* Carry out need_resched()/cond_resched() forward-progress testing. */
+static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 {
 	unsigned long cver;
 	unsigned long dur;
 	struct fwd_cb_state fcs;
 	unsigned long gps;
-	int i;
 	int idx;
+	int sd;
+	int sd4;
+	bool selfpropcb = false;
+	unsigned long stopat;
+	static DEFINE_TORTURE_RANDOM(trs);
+
+	if  (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) {
+		init_rcu_head_on_stack(&fcs.rh);
+		selfpropcb = true;
+	}
+
+	/* Tight loop containing cond_resched(). */
+	if  (selfpropcb) {
+		WRITE_ONCE(fcs.stop, 0);
+		cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
+	}
+	cver = READ_ONCE(rcu_torture_current_version);
+	gps = cur_ops->get_gp_seq();
+	sd = cur_ops->stall_dur() + 1;
+	sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
+	dur = sd4 + torture_random(&trs) % (sd - sd4);
+	rcu_fwd_startat = jiffies;
+	stopat = rcu_fwd_startat + dur;
+	while (time_before(jiffies, stopat) && !torture_must_stop()) {
+		idx = cur_ops->readlock();
+		udelay(10);
+		cur_ops->readunlock(idx);
+		if (!fwd_progress_need_resched || need_resched())
+			cond_resched();
+	}
+	(*tested_tries)++;
+	if (!time_before(jiffies, stopat) && !torture_must_stop()) {
+		(*tested)++;
+		cver = READ_ONCE(rcu_torture_current_version) - cver;
+		gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+		WARN_ON(!cver && gps < 2);
+		pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+	}
+	if (selfpropcb) {
+		WRITE_ONCE(fcs.stop, 1);
+		cur_ops->sync(); /* Wait for running CB to complete. */
+		cur_ops->cb_barrier(); /* Wait for queued callbacks. */
+	}
+
+	if (selfpropcb) {
+		WARN_ON(READ_ONCE(fcs.stop) != 2);
+		destroy_rcu_head_on_stack(&fcs.rh);
+	}
+}
+
+/* Carry out call_rcu() forward-progress testing. */
+static void rcu_torture_fwd_prog_cr(void)
+{
+	unsigned long cver;
+	unsigned long gps;
+	int i;
 	int j;
 	long n_launders;
 	long n_launders_cb_snap;
@@ -1667,136 +1722,97 @@ static int rcu_torture_fwd_prog(void *args)
 	long n_max_gps;
 	struct rcu_fwd_cb *rfcp;
 	struct rcu_fwd_cb *rfcpn;
-	int sd;
-	int sd4;
-	bool selfpropcb = false;
 	unsigned long stopat;
 	unsigned long stoppedat;
+
+	/* Loop continuously posting RCU callbacks. */
+	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+	cur_ops->sync(); /* Later readers see above write. */
+	rcu_fwd_startat = jiffies;
+	stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+	n_launders = 0;
+	n_launders_cb = 0;
+	n_launders_sa = 0;
+	n_max_cbs = 0;
+	n_max_gps = 0;
+	for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
+		n_launders_hist[i] = 0;
+	cver = READ_ONCE(rcu_torture_current_version);
+	gps = cur_ops->get_gp_seq();
+	while (time_before(jiffies, stopat) && !torture_must_stop()) {
+		rfcp = READ_ONCE(rcu_fwd_cb_head);
+		rfcpn = NULL;
+		if (rfcp)
+			rfcpn = READ_ONCE(rfcp->rfc_next);
+		if (rfcpn) {
+			if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
+			    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
+				break;
+			rcu_fwd_cb_head = rfcpn;
+			n_launders++;
+			n_launders_sa++;
+		} else {
+			rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
+			if (WARN_ON_ONCE(!rfcp)) {
+				schedule_timeout_interruptible(1);
+				continue;
+			}
+			n_max_cbs++;
+			n_launders_sa = 0;
+			rfcp->rfc_gps = 0;
+		}
+		cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+		cond_resched();
+	}
+	stoppedat = jiffies;
+	n_launders_cb_snap = READ_ONCE(n_launders_cb);
+	cver = READ_ONCE(rcu_torture_current_version) - cver;
+	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
+	for (;;) {
+		rfcp = rcu_fwd_cb_head;
+		if (!rfcp)
+			break;
+		rcu_fwd_cb_head = rfcp->rfc_next;
+		kfree(rfcp);
+	}
+	rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+	WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+	if (!torture_must_stop()) {
+		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
+		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+			 __func__,
+			 stoppedat - rcu_fwd_startat, jiffies - stoppedat,
+			 n_launders + n_max_cbs - n_launders_cb_snap,
+			 n_launders, n_launders_sa,
+			 n_max_gps, n_max_cbs, cver, gps);
+		for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
+			if (n_launders_hist[i] > 0)
+				break;
+		pr_alert("Callback-invocation histogram:");
+		for (j = 0; j <= i; j++)
+			pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
+		pr_cont("\n");
+	}
+}
+
+/* Carry out grace-period forward-progress testing. */
+static int rcu_torture_fwd_prog(void *args)
+{
 	int tested = 0;
 	int tested_tries = 0;
-	static DEFINE_TORTURE_RANDOM(trs);
 
 	VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
 	if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
 		set_user_nice(current, MAX_NICE);
-	if  (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) {
-		init_rcu_head_on_stack(&fcs.rh);
-		selfpropcb = true;
-	}
 	do {
 		schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
-
-		/* Tight loop containing cond_resched(). */
-		if  (selfpropcb) {
-			WRITE_ONCE(fcs.stop, 0);
-			cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
-		}
-		cver = READ_ONCE(rcu_torture_current_version);
-		gps = cur_ops->get_gp_seq();
-		sd = cur_ops->stall_dur() + 1;
-		sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
-		dur = sd4 + torture_random(&trs) % (sd - sd4);
-		rcu_fwd_startat = jiffies;
-		stopat = rcu_fwd_startat + dur;
-		while (time_before(jiffies, stopat) && !torture_must_stop()) {
-			idx = cur_ops->readlock();
-			udelay(10);
-			cur_ops->readunlock(idx);
-			if (!fwd_progress_need_resched || need_resched())
-				cond_resched();
-		}
-		tested_tries++;
-		if (!time_before(jiffies, stopat) && !torture_must_stop()) {
-			tested++;
-			cver = READ_ONCE(rcu_torture_current_version) - cver;
-			gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
-			WARN_ON(!cver && gps < 2);
-			pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
-		}
-		if (selfpropcb) {
-			WRITE_ONCE(fcs.stop, 1);
-			cur_ops->sync(); /* Wait for running CB to complete. */
-			cur_ops->cb_barrier(); /* Wait for queued callbacks. */
-		}
-
-		/* Loop continuously posting RCU callbacks. */
-		WRITE_ONCE(rcu_fwd_cb_nodelay, true);
-		cur_ops->sync(); /* Later readers see above write. */
-		rcu_fwd_startat = jiffies;
-		stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
-		n_launders = 0;
-		n_launders_cb = 0;
-		n_launders_sa = 0;
-		n_max_cbs = 0;
-		n_max_gps = 0;
-		for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
-			n_launders_hist[i] = 0;
-		cver = READ_ONCE(rcu_torture_current_version);
-		gps = cur_ops->get_gp_seq();
-		while (time_before(jiffies, stopat) && !torture_must_stop()) {
-			rfcp = READ_ONCE(rcu_fwd_cb_head);
-			rfcpn = NULL;
-			if (rfcp)
-				rfcpn = READ_ONCE(rfcp->rfc_next);
-			if (rfcpn) {
-				if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
-				    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
-					break;
-				rcu_fwd_cb_head = rfcpn;
-				n_launders++;
-				n_launders_sa++;
-			} else {
-				rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
-				if (WARN_ON_ONCE(!rfcp)) {
-					schedule_timeout_interruptible(1);
-					continue;
-				}
-				n_max_cbs++;
-				n_launders_sa = 0;
-				rfcp->rfc_gps = 0;
-			}
-			cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
-			cond_resched();
-		}
-		stoppedat = jiffies;
-		n_launders_cb_snap = READ_ONCE(n_launders_cb);
-		cver = READ_ONCE(rcu_torture_current_version) - cver;
-		gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
-		cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
-		for (;;) {
-			rfcp = rcu_fwd_cb_head;
-			if (!rfcp)
-				break;
-			rcu_fwd_cb_head = rfcp->rfc_next;
-			kfree(rfcp);
-		}
-		rcu_fwd_cb_tail = &rcu_fwd_cb_head;
-		WRITE_ONCE(rcu_fwd_cb_nodelay, false);
-		if (!torture_must_stop()) {
-			WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
-			pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
-				 __func__,
-				 stoppedat - rcu_fwd_startat,
-				 jiffies - stoppedat,
-				 n_launders + n_max_cbs - n_launders_cb_snap,
-				 n_launders, n_launders_sa,
-				 n_max_gps, n_max_cbs, cver, gps);
-			for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
-				if (n_launders_hist[i] > 0)
-					break;
-			pr_alert("Callback-invocation histogram:");
-			for (j = 0; j <= i; j++)
-				pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
-			pr_cont("\n");
-		}
+		rcu_torture_fwd_prog_nr(&tested, &tested_tries);
+		rcu_torture_fwd_prog_cr();
 
 		/* Avoid slow periods, better to test when busy. */
 		stutter_wait("rcu_torture_fwd_prog");
 	} while (!torture_must_stop());
-	if (selfpropcb) {
-		WARN_ON(READ_ONCE(fcs.stop) != 2);
-		destroy_rcu_head_on_stack(&fcs.rh);
-	}
 	/* Short runs might not contain a valid forward-progress attempt. */
 	WARN_ON(!tested && tested_tries >= 5);
 	pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
-- 
cgit 


From 5ab7ab8362fa8a4f7995d65ea05edf71530e8004 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Fri, 21 Sep 2018 18:08:09 -0700
Subject: rcutorture: Affinity forward-progress test to avoid housekeeping CPUs

This commit affinities the forward-progress tests to avoid hogging a
housekeeping CPU on the theory that the offloaded callbacks will be
running on those housekeeping CPUs.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
[ paulmck: Fix NULL-pointer issue located by kbuild test robot. ]
Tested-by: Rong Chen <rong.a.chen@intel.com>
---
 kernel/rcu/rcu.h         |  2 ++
 kernel/rcu/rcutorture.c  |  1 +
 kernel/rcu/tree_plugin.h | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 2866166863f0..0f0f5ae8c3d4 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -539,8 +539,10 @@ extern struct workqueue_struct *rcu_par_gp_wq;
 
 #ifdef CONFIG_RCU_NOCB_CPU
 bool rcu_is_nocb_cpu(int cpu);
+void rcu_bind_current_to_nocb(void);
 #else
 static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+static inline void rcu_bind_current_to_nocb(void) { }
 #endif
 
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bcc33bb8d9a6..36a3bc42782d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1803,6 +1803,7 @@ static int rcu_torture_fwd_prog(void *args)
 	int tested_tries = 0;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
+	rcu_bind_current_to_nocb();
 	if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
 		set_user_nice(current, MAX_NICE);
 	do {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 605ff3b06098..25fe26c4e9c1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2603,6 +2603,17 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
 	return true;
 }
 
+/*
+ * Bind the current task to the offloaded CPUs.  If there are no offloaded
+ * CPUs, leave the task unbound.  Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+	if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+		WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 
 static bool rcu_nocb_cpu_needs_barrier(int cpu)
-- 
cgit 


From 2a7d968816a94a4c52f0082c085c6714a5b3f1ec Mon Sep 17 00:00:00 2001
From: Pierce Griffiths <pierceagriffiths@gmail.com>
Date: Fri, 21 Sep 2018 20:21:31 -0500
Subject: torture: Remove unnecessary "ret" variables

Remove return variables (declared as "ret") in cases where,
depending on whether a condition evaluates as true, the result of a
function call can be immediately returned instead of storing the result in
the return variable. When the condition evaluates as false, the constant
initially stored in the return variable at declaration is returned instead.

Signed-off-by: Pierce Griffiths <pierceagriffiths@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/torture.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index 9410d1bf84d6..bbf6d473e50c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -245,16 +245,15 @@ stop:
  */
 int torture_onoff_init(long ooholdoff, long oointerval)
 {
-	int ret = 0;
-
 #ifdef CONFIG_HOTPLUG_CPU
 	onoff_holdoff = ooholdoff;
 	onoff_interval = oointerval;
 	if (onoff_interval <= 0)
 		return 0;
-	ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-	return ret;
+	return torture_create_kthread(torture_onoff, NULL, onoff_task);
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+	return 0;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 }
 EXPORT_SYMBOL_GPL(torture_onoff_init);
 
@@ -525,15 +524,13 @@ static int torture_shutdown(void *arg)
  */
 int torture_shutdown_init(int ssecs, void (*cleanup)(void))
 {
-	int ret = 0;
-
 	torture_shutdown_hook = cleanup;
 	if (ssecs > 0) {
 		shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
-		ret = torture_create_kthread(torture_shutdown, NULL,
+		return torture_create_kthread(torture_shutdown, NULL,
 					     shutdown_task);
 	}
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(torture_shutdown_init);
 
@@ -632,13 +629,10 @@ static int torture_stutter(void *arg)
 /*
  * Initialize and kick off the torture_stutter kthread.
  */
-int torture_stutter_init(int s)
+int torture_stutter_init(const int s)
 {
-	int ret;
-
 	stutter = s;
-	ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
-	return ret;
+	return torture_create_kthread(torture_stutter, NULL, stutter_task);
 }
 EXPORT_SYMBOL_GPL(torture_stutter_init);
 
-- 
cgit 


From 61670adcb4a9f66ff3fa8a9e846a623d9a9e1553 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 1 Oct 2018 13:27:41 -0700
Subject: rcutorture: Prepare for asynchronous access to rcu_fwd_startat

Because rcutorture's forward-progress checking will trigger from an
OOM notifier, this notifier will introduce asynchronous concurrent
access to the rcu_fwd_startat variable.  This commit therefore prepares
for this by converting updates to WRITE_ONCE().

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 36a3bc42782d..c4fd61dccedb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1679,7 +1679,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 	sd = cur_ops->stall_dur() + 1;
 	sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
 	dur = sd4 + torture_random(&trs) % (sd - sd4);
-	rcu_fwd_startat = jiffies;
+	WRITE_ONCE(rcu_fwd_startat, jiffies);
 	stopat = rcu_fwd_startat + dur;
 	while (time_before(jiffies, stopat) && !torture_must_stop()) {
 		idx = cur_ops->readlock();
@@ -1728,7 +1728,7 @@ static void rcu_torture_fwd_prog_cr(void)
 	/* Loop continuously posting RCU callbacks. */
 	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
 	cur_ops->sync(); /* Later readers see above write. */
-	rcu_fwd_startat = jiffies;
+	WRITE_ONCE(rcu_fwd_startat, jiffies);
 	stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
 	n_launders = 0;
 	n_launders_cb = 0;
-- 
cgit 


From e0aff97355575ac6a28a48a4217533a3953095c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Mon, 1 Oct 2018 17:40:54 -0700
Subject: rcutorture: Dump grace-period diagnostics upon forward-progress OOM

This commit adds an OOM notifier during rcutorture forward-progress
testing.  If this notifier is invoked, it dumps out some grace-period
state to help debug the forward-progress problem.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcu.h        |  2 ++
 kernel/rcu/rcutorture.c | 31 ++++++++++++++++++++++++++++---
 kernel/rcu/tree.c       | 20 ++++++++++++++++++++
 3 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0f0f5ae8c3d4..a393e24a9195 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -526,12 +526,14 @@ srcu_batches_completed(struct srcu_struct *sp) { return 0; }
 static inline void rcu_force_quiescent_state(void) { }
 static inline void show_rcu_gp_kthreads(void) { }
 static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
+static inline void rcu_fwd_progress_check(unsigned long j) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 unsigned long rcu_get_gp_seq(void);
 unsigned long rcu_exp_batches_completed(void);
 unsigned long srcu_batches_completed(struct srcu_struct *sp);
 void show_rcu_gp_kthreads(void);
 int rcu_get_gp_kthreads_prio(void);
+void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
 extern struct workqueue_struct *rcu_par_gp_wq;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index c4fd61dccedb..f28b88ecb47a 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -56,6 +56,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/sysctl.h>
+#include <linux/oom.h>
 
 #include "rcu.h"
 
@@ -1624,6 +1625,7 @@ static struct rcu_fwd_cb *rcu_fwd_cb_head;
 static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
 static long n_launders_cb;
 static unsigned long rcu_fwd_startat;
+static bool rcu_fwd_emergency_stop;
 #define MAX_FWD_CB_JIFFIES	(8 * HZ) /* Maximum CB test duration. */
 #define MIN_FWD_CB_LAUNDERS	3	/* This many CB invocations to count. */
 #define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */
@@ -1681,7 +1683,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 	dur = sd4 + torture_random(&trs) % (sd - sd4);
 	WRITE_ONCE(rcu_fwd_startat, jiffies);
 	stopat = rcu_fwd_startat + dur;
-	while (time_before(jiffies, stopat) && !torture_must_stop()) {
+	while (time_before(jiffies, stopat) &&
+	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
 		idx = cur_ops->readlock();
 		udelay(10);
 		cur_ops->readunlock(idx);
@@ -1689,7 +1692,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 			cond_resched();
 	}
 	(*tested_tries)++;
-	if (!time_before(jiffies, stopat) && !torture_must_stop()) {
+	if (!time_before(jiffies, stopat) &&
+	    !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
 		(*tested)++;
 		cver = READ_ONCE(rcu_torture_current_version) - cver;
 		gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
@@ -1739,7 +1743,8 @@ static void rcu_torture_fwd_prog_cr(void)
 		n_launders_hist[i] = 0;
 	cver = READ_ONCE(rcu_torture_current_version);
 	gps = cur_ops->get_gp_seq();
-	while (time_before(jiffies, stopat) && !torture_must_stop()) {
+	while (time_before(jiffies, stopat) &&
+	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
 		rfcp = READ_ONCE(rcu_fwd_cb_head);
 		rfcpn = NULL;
 		if (rfcp)
@@ -1796,6 +1801,23 @@ static void rcu_torture_fwd_prog_cr(void)
 	}
 }
 
+
+/*
+ * OOM notifier, but this only prints diagnostic information for the
+ * current forward-progress test.
+ */
+static int rcutorture_oom_notify(struct notifier_block *self,
+				 unsigned long notused, void *nfreed)
+{
+	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
+	WRITE_ONCE(rcu_fwd_emergency_stop, true);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_oom_nb = {
+	.notifier_call = rcutorture_oom_notify
+};
+
 /* Carry out grace-period forward-progress testing. */
 static int rcu_torture_fwd_prog(void *args)
 {
@@ -1808,8 +1830,11 @@ static int rcu_torture_fwd_prog(void *args)
 		set_user_nice(current, MAX_NICE);
 	do {
 		schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+		WRITE_ONCE(rcu_fwd_emergency_stop, false);
+		register_oom_notifier(&rcutorture_oom_nb);
 		rcu_torture_fwd_prog_nr(&tested, &tested_tries);
 		rcu_torture_fwd_prog_cr();
+		unregister_oom_notifier(&rcutorture_oom_nb);
 
 		/* Avoid slow periods, better to test when busy. */
 		stutter_wait("rcu_torture_fwd_prog");
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6ec3abbe90e2..853b79a6ff10 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2657,6 +2657,26 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
+/*
+ * Do a forward-progress check for rcutorture.  This is normally invoked
+ * due to an OOM event.  The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+	struct rcu_data *rdp;
+
+	if (rcu_gp_in_progress()) {
+		show_rcu_gp_kthreads();
+	} else {
+		preempt_disable();
+		rdp = this_cpu_ptr(&rcu_data);
+		rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
 /*
  * This does the RCU core processing work for the specified rcu_data
  * structures.  This may be called only from the CPU to whom the rdp
-- 
cgit 


From 903ee83d91776bc72d856147743687d4b6c99286 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 2 Oct 2018 16:05:46 -0700
Subject: rcu: Account for nocb-CPU callback counts in RCU CPU stall warnings

The RCU CPU stall warnings print an estimate of the total number of
RCU callbacks queued in the system, but this estimate leaves out
the callbacks queued for nocbs CPUs.  This commit therefore introduces
rcu_get_n_cbs_cpu(), which gives an accurate callback estimate for
both nocbs and normal CPUs, and uses this new function as needed.

This commit also introduces a rcu_get_n_cbs_nocb_cpu() helper function
that returns the number of callbacks for nocbs CPUs or zero otherwise,
and also uses this function in place of direct access to ->nocb_q_count
while in the area (fewer characters, you see).

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c        | 19 +++++++++++++++----
 kernel/rcu/tree.h        |  1 +
 kernel/rcu/tree_plugin.h | 24 +++++++++++++++++++-----
 3 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 853b79a6ff10..0933d8650890 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -207,6 +207,19 @@ static int rcu_gp_in_progress(void)
 	return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
 }
 
+/*
+ * Return the number of callbacks queued on the specified CPU.
+ * Handles both the nocbs and normal cases.
+ */
+static long rcu_get_n_cbs_cpu(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+	if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+		return rcu_segcblist_n_cbs(&rdp->cblist);
+	return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+}
+
 void rcu_softirq_qs(void)
 {
 	rcu_qs();
@@ -1265,8 +1278,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
 
 	print_cpu_stall_info_end();
 	for_each_possible_cpu(cpu)
-		totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
-							    cpu)->cblist);
+		totqlen += rcu_get_n_cbs_cpu(cpu);
 	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
 	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
 	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
@@ -1326,8 +1338,7 @@ static void print_cpu_stall(void)
 	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
 	print_cpu_stall_info_end();
 	for_each_possible_cpu(cpu)
-		totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data,
-							    cpu)->cblist);
+		totqlen += rcu_get_n_cbs_cpu(cpu);
 	pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
 		jiffies - rcu_state.gp_start,
 		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index c3e2807a834a..a8f82b7dc5e2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -455,6 +455,7 @@ static void __init rcu_spawn_nocb_kthreads(void);
 static void __init rcu_organize_nocb_kthreads(void);
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 static bool init_nocb_callback_list(struct rcu_data *rdp);
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp);
 static void rcu_bind_gp_kthread(void);
 static bool rcu_nohz_full_cpu(void);
 static void rcu_dynticks_task_enter(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 25fe26c4e9c1..1b3dd2fc0cd6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2011,7 +2011,7 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu)
 	 * (if a callback is in fact needed).  This is associated with an
 	 * atomic_inc() in the caller.
 	 */
-	ret = atomic_long_read(&rdp->nocb_q_count);
+	ret = rcu_get_n_cbs_nocb_cpu(rdp);
 
 #ifdef CONFIG_PROVE_RCU
 	rhp = READ_ONCE(rdp->nocb_head);
@@ -2066,7 +2066,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 				    TPS("WakeNotPoll"));
 		return;
 	}
-	len = atomic_long_read(&rdp->nocb_q_count);
+	len = rcu_get_n_cbs_nocb_cpu(rdp);
 	if (old_rhpp == &rdp->nocb_head) {
 		if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
@@ -2115,11 +2115,11 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 		trace_rcu_kfree_callback(rcu_state.name, rhp,
 					 (unsigned long)rhp->func,
 					 -atomic_long_read(&rdp->nocb_q_count_lazy),
-					 -atomic_long_read(&rdp->nocb_q_count));
+					 -rcu_get_n_cbs_nocb_cpu(rdp));
 	else
 		trace_rcu_callback(rcu_state.name, rhp,
 				   -atomic_long_read(&rdp->nocb_q_count_lazy),
-				   -atomic_long_read(&rdp->nocb_q_count));
+				   -rcu_get_n_cbs_nocb_cpu(rdp));
 
 	/*
 	 * If called from an extended quiescent state with interrupts
@@ -2343,7 +2343,7 @@ static int rcu_nocb_kthread(void *arg)
 		/* Each pass through the following loop invokes a callback. */
 		trace_rcu_batch_start(rcu_state.name,
 				      atomic_long_read(&rdp->nocb_q_count_lazy),
-				      atomic_long_read(&rdp->nocb_q_count), -1);
+				      rcu_get_n_cbs_nocb_cpu(rdp), -1);
 		c = cl = 0;
 		while (list) {
 			next = list->next;
@@ -2614,6 +2614,15 @@ void rcu_bind_current_to_nocb(void)
 }
 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
 
+/*
+ * Return the number of RCU callbacks still queued from the specified
+ * CPU, which must be a nocbs CPU.
+ */
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+{
+	return atomic_long_read(&rdp->nocb_q_count);
+}
+
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 
 static bool rcu_nocb_cpu_needs_barrier(int cpu)
@@ -2674,6 +2683,11 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
 	return false;
 }
 
+static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+{
+	return 0;
+}
+
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 /*
-- 
cgit 


From bfcfcffc5f23e76a1b88f7412ee7efaec5107b28 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 2 Oct 2018 17:09:56 -0700
Subject: rcu: Print per-CPU callback counts for forward-progress failures

This commit prints out the non-zero per-CPU callback counts when a
forware-progress error (OOM event) occurs.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
[ paulmck: Fix a pair of uninitialized locals spotted by kbuild test robot. ]
---
 kernel/rcu/tree.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0933d8650890..90a67c75a447 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2675,6 +2675,10 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
  */
 void rcu_fwd_progress_check(unsigned long j)
 {
+	unsigned long cbs;
+	int cpu;
+	unsigned long max_cbs = 0;
+	int max_cpu = -1;
 	struct rcu_data *rdp;
 
 	if (rcu_gp_in_progress()) {
@@ -2685,6 +2689,20 @@ void rcu_fwd_progress_check(unsigned long j)
 		rcu_check_gp_start_stall(rdp->mynode, rdp, j);
 		preempt_enable();
 	}
+	for_each_possible_cpu(cpu) {
+		cbs = rcu_get_n_cbs_cpu(cpu);
+		if (!cbs)
+			continue;
+		if (max_cpu < 0)
+			pr_info("%s: callbacks", __func__);
+		pr_cont(" %d: %lu", cpu, cbs);
+		if (cbs <= max_cbs)
+			continue;
+		max_cbs = cbs;
+		max_cpu = cpu;
+	}
+	if (max_cpu >= 0)
+		pr_cont("\n");
 }
 EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
 
-- 
cgit 


From 8dd3b54689d9a2103c0817f2b7adc51760a45551 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 3 Oct 2018 11:02:05 -0700
Subject: rcutorture: Print GP age upon forward-progress failure

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 90a67c75a447..f91631541965 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2682,6 +2682,8 @@ void rcu_fwd_progress_check(unsigned long j)
 	struct rcu_data *rdp;
 
 	if (rcu_gp_in_progress()) {
+		pr_info("%s: GP age %lu jiffies\n",
+			__func__, jiffies - rcu_state.gp_start);
 		show_rcu_gp_kthreads();
 	} else {
 		preempt_disable();
-- 
cgit 


From 1a682754c7ed9df213069d5a0d3981f8360a32c2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 3 Oct 2018 12:33:41 -0700
Subject: rcutorture: Print histogram of CB invocation at OOM time

One reason why a forward-progress test might fail would be if something
prevented or delayed callback invocation.  This commit therefore adds a
callback-invocation histogram printout when OOM is reported to rcutorture.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f28b88ecb47a..329f4fb13125 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1631,6 +1631,20 @@ static bool rcu_fwd_emergency_stop;
 #define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */
 static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ];
 
+static void rcu_torture_fwd_cb_hist(void)
+{
+	int i;
+	int j;
+
+	for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
+		if (n_launders_hist[i] > 0)
+			break;
+	pr_alert("%s: Callback-invocation histogram:", __func__);
+	for (j = 0; j <= i; j++)
+		pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
+	pr_cont("\n");
+}
+
 /* Callback function for continuous-flood RCU callbacks. */
 static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 {
@@ -1718,7 +1732,6 @@ static void rcu_torture_fwd_prog_cr(void)
 	unsigned long cver;
 	unsigned long gps;
 	int i;
-	int j;
 	long n_launders;
 	long n_launders_cb_snap;
 	long n_launders_sa;
@@ -1791,13 +1804,7 @@ static void rcu_torture_fwd_prog_cr(void)
 			 n_launders + n_max_cbs - n_launders_cb_snap,
 			 n_launders, n_launders_sa,
 			 n_max_gps, n_max_cbs, cver, gps);
-		for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
-			if (n_launders_hist[i] > 0)
-				break;
-		pr_alert("Callback-invocation histogram:");
-		for (j = 0; j <= i; j++)
-			pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
-		pr_cont("\n");
+		rcu_torture_fwd_cb_hist();
 	}
 }
 
@@ -1809,6 +1816,7 @@ static void rcu_torture_fwd_prog_cr(void)
 static int rcutorture_oom_notify(struct notifier_block *self,
 				 unsigned long notused, void *nfreed)
 {
+	rcu_torture_fwd_cb_hist();
 	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
 	WRITE_ONCE(rcu_fwd_emergency_stop, true);
 	return NOTIFY_OK;
-- 
cgit 


From c51d7b5e6c94aa6b554c27bd2b0eb64ebef02334 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Wed, 3 Oct 2018 17:25:33 -0700
Subject: rcutorture: Print time since GP end upon forward-progress failure

If rcutorture's forward-progress tests fail while a grace period is not
in progress, it is useful to print the time since the last grace period
ended as a way to detect failure to launch a new grace period.  This
commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/tree.c | 5 ++++-
 kernel/rcu/tree.h | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f91631541965..9180158756d2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2000,7 +2000,8 @@ static void rcu_gp_cleanup(void)
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	raw_spin_lock_irq_rcu_node(rnp);
-	gp_duration = jiffies - rcu_state.gp_start;
+	rcu_state.gp_end = jiffies;
+	gp_duration = rcu_state.gp_end - rcu_state.gp_start;
 	if (gp_duration > rcu_state.gp_max)
 		rcu_state.gp_max = gp_duration;
 
@@ -2686,6 +2687,8 @@ void rcu_fwd_progress_check(unsigned long j)
 			__func__, jiffies - rcu_state.gp_start);
 		show_rcu_gp_kthreads();
 	} else {
+		pr_info("%s: Last GP end %lu jiffies ago\n",
+			__func__, jiffies - rcu_state.gp_end);
 		preempt_disable();
 		rdp = this_cpu_ptr(&rcu_data);
 		rcu_check_gp_start_stall(rdp->mynode, rdp, j);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a8f82b7dc5e2..d90b02b53c0e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -328,6 +328,8 @@ struct rcu_state {
 						/*  force_quiescent_state(). */
 	unsigned long gp_start;			/* Time at which GP started, */
 						/*  but in jiffies. */
+	unsigned long gp_end;			/* Time last GP ended, again */
+						/*  in jiffies. */
 	unsigned long gp_activity;		/* Time of last GP kthread */
 						/*  activity in jiffies. */
 	unsigned long gp_req_activity;		/* Time of last GP request */
-- 
cgit 


From 73d665b1410afae405309ad4475a98924776ab13 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Thu, 4 Oct 2018 10:54:22 -0700
Subject: rcutorture: Print forward-progress test age upon failure

This commit prints the age of the forward-progress test in jiffies,
in order to allow better interpretation of the callback-invocation
histograms.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 329f4fb13125..080b5ac6340c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1639,7 +1639,8 @@ static void rcu_torture_fwd_cb_hist(void)
 	for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
 		if (n_launders_hist[i] > 0)
 			break;
-	pr_alert("%s: Callback-invocation histogram:", __func__);
+	pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
+		 __func__, jiffies - rcu_fwd_startat);
 	for (j = 0; j <= i; j++)
 		pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
 	pr_cont("\n");
-- 
cgit 


From 2667ccce9328e4e25ed77a83291c066d5e11e65a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Fri, 5 Oct 2018 09:09:49 -0700
Subject: rcutorture: Recover from OOM during forward-progress tests

This commit causes the OOM handler to do rcu_barrier() calls and to
free up forward-progress callbacks in order to recover from OOM events.
The current test is terminated, but subsequent forward-progress tests can
proceed.  This allows a long test to result in multiple forward-progress
failures, greatly reducing the required testing time.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 60 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 080b5ac6340c..afa98162575d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1649,13 +1649,14 @@ static void rcu_torture_fwd_cb_hist(void)
 /* Callback function for continuous-flood RCU callbacks. */
 static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 {
+	unsigned long flags;
 	int i;
 	struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
 	struct rcu_fwd_cb **rfcpp;
 
 	rfcp->rfc_next = NULL;
 	rfcp->rfc_gps++;
-	spin_lock(&rcu_fwd_lock);
+	spin_lock_irqsave(&rcu_fwd_lock, flags);
 	rfcpp = rcu_fwd_cb_tail;
 	rcu_fwd_cb_tail = &rfcp->rfc_next;
 	WRITE_ONCE(*rfcpp, rfcp);
@@ -1664,7 +1665,33 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 	if (i >= ARRAY_SIZE(n_launders_hist))
 		i = ARRAY_SIZE(n_launders_hist) - 1;
 	n_launders_hist[i]++;
-	spin_unlock(&rcu_fwd_lock);
+	spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+}
+
+/*
+ * Free all callbacks on the rcu_fwd_cb_head list, either because the
+ * test is over or because we hit an OOM event.
+ */
+static unsigned long rcu_torture_fwd_prog_cbfree(void)
+{
+	unsigned long flags;
+	unsigned long freed = 0;
+	struct rcu_fwd_cb *rfcp;
+
+	for (;;) {
+		spin_lock_irqsave(&rcu_fwd_lock, flags);
+		rfcp = rcu_fwd_cb_head;
+		if (!rfcp)
+			break;
+		rcu_fwd_cb_head = rfcp->rfc_next;
+		if (!rcu_fwd_cb_head)
+			rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+		spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+		kfree(rfcp);
+		freed++;
+	}
+	spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+	return freed;
 }
 
 /* Carry out need_resched()/cond_resched() forward-progress testing. */
@@ -1743,6 +1770,9 @@ static void rcu_torture_fwd_prog_cr(void)
 	unsigned long stopat;
 	unsigned long stoppedat;
 
+	if (READ_ONCE(rcu_fwd_emergency_stop))
+		return; /* Get out of the way quickly, no GP wait! */
+
 	/* Loop continuously posting RCU callbacks. */
 	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
 	cur_ops->sync(); /* Later readers see above write. */
@@ -1788,16 +1818,10 @@ static void rcu_torture_fwd_prog_cr(void)
 	cver = READ_ONCE(rcu_torture_current_version) - cver;
 	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
 	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
-	for (;;) {
-		rfcp = rcu_fwd_cb_head;
-		if (!rfcp)
-			break;
-		rcu_fwd_cb_head = rfcp->rfc_next;
-		kfree(rfcp);
-	}
-	rcu_fwd_cb_tail = &rcu_fwd_cb_head;
+	(void)rcu_torture_fwd_prog_cbfree();
+
 	WRITE_ONCE(rcu_fwd_cb_nodelay, false);
-	if (!torture_must_stop()) {
+	if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
 		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
 		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
 			 __func__,
@@ -1817,9 +1841,23 @@ static void rcu_torture_fwd_prog_cr(void)
 static int rcutorture_oom_notify(struct notifier_block *self,
 				 unsigned long notused, void *nfreed)
 {
+	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
+	     __func__);
 	rcu_torture_fwd_cb_hist();
 	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
 	WRITE_ONCE(rcu_fwd_emergency_stop, true);
+	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
+	pr_info("%s: Freed %lu RCU callbacks.\n",
+		__func__, rcu_torture_fwd_prog_cbfree());
+	rcu_barrier();
+	pr_info("%s: Freed %lu RCU callbacks.\n",
+		__func__, rcu_torture_fwd_prog_cbfree());
+	rcu_barrier();
+	pr_info("%s: Freed %lu RCU callbacks.\n",
+		__func__, rcu_torture_fwd_prog_cbfree());
+	smp_mb(); /* Frees before return to avoid redoing OOM. */
+	(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
+	pr_info("%s returning after OOM processing.\n", __func__);
 	return NOTIFY_OK;
 }
 
-- 
cgit 


From 2e57bf97a6856f2dc10fb4377c452cb08f844047 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Fri, 5 Oct 2018 16:43:09 -0700
Subject: rcutorture: Use 100ms buckets for forward-progress callback
 histograms

This commit narrows the scope of each bucket of the forward-progress
callback-invocation histograms from one second to 100 milliseconds, which
aids debugging of forward-progress problems by making shorter-duration
callback-invocation stalls visible.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index afa98162575d..a4c4a24bdcaa 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1629,7 +1629,8 @@ static bool rcu_fwd_emergency_stop;
 #define MAX_FWD_CB_JIFFIES	(8 * HZ) /* Maximum CB test duration. */
 #define MIN_FWD_CB_LAUNDERS	3	/* This many CB invocations to count. */
 #define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */
-static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ];
+#define FWD_CBS_HIST_DIV	10	/* Histogram buckets/second. */
+static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)];
 
 static void rcu_torture_fwd_cb_hist(void)
 {
@@ -1642,7 +1643,8 @@ static void rcu_torture_fwd_cb_hist(void)
 	pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
 		 __func__, jiffies - rcu_fwd_startat);
 	for (j = 0; j <= i; j++)
-		pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]);
+		pr_cont(" %ds/%d: %ld",
+			j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]);
 	pr_cont("\n");
 }
 
@@ -1661,7 +1663,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 	rcu_fwd_cb_tail = &rfcp->rfc_next;
 	WRITE_ONCE(*rfcpp, rfcp);
 	WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
-	i = ((jiffies - rcu_fwd_startat) / HZ);
+	i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
 	if (i >= ARRAY_SIZE(n_launders_hist))
 		i = ARRAY_SIZE(n_launders_hist) - 1;
 	n_launders_hist[i]++;
-- 
cgit 


From 5ac7cdc29897e5fc3f5e214f3f8c8b03ef8d7029 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 16 Oct 2018 05:46:58 -0700
Subject: rcutorture: Don't do busted forward-progress testing

The "busted" rcutorture type is an intentionally broken implementation
of RCU.  Doing forward-progress testing on this implementation is not
particularly meaningful on the one hand and can result in fatal abuse
of the memory allocator on the other.  This commit therefore disables
forward-progress testing of the "busted" rcutorture type.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
---
 kernel/rcu/rcutorture.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index a4c4a24bdcaa..f6e85faa4ff4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1900,7 +1900,8 @@ static int __init rcu_torture_fwd_prog_init(void)
 {
 	if (!fwd_progress)
 		return 0; /* Not requested, so don't do it. */
-	if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) {
+	if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
+	    cur_ops == &rcu_busted_ops) {
 		VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
 		return 0;
 	}
-- 
cgit 


From 5482e9a93c83839f94e75db712e6837e6a39962c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 1 Dec 2018 17:08:44 -0800
Subject: bpf: Fix memleak in aux->func_info and aux->btf

The aux->func_info and aux->btf are leaked in the error out cases
during bpf_prog_load().  This patch fixes it.

Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info")
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f9554d9a14e1..4445d0d084d8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1560,6 +1560,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	return err;
 
 free_used_maps:
+	kvfree(prog->aux->func_info);
+	btf_put(prog->aux->btf);
 	bpf_prog_kallsyms_del_subprogs(prog);
 	free_used_maps(prog->aux);
 free_prog:
-- 
cgit 


From fca0c116504e1c5c72963bf28b1ede0932228fef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 3 Dec 2018 10:52:21 +0100
Subject: perf: Fix typos in comments

Fix two typos in kernel/events/*.

No change in functionality intended.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c          | 2 +-
 kernel/events/hw_breakpoint.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84530ab358c3..a2a6e0b4a881 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5541,7 +5541,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
-	.close		= perf_mmap_close, /* non mergable */
+	.close		= perf_mmap_close, /* non mergeable */
 	.fault		= perf_mmap_fault,
 	.page_mkwrite	= perf_mmap_fault,
 };
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index d6b56180827c..5befb338a18d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -238,7 +238,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
 }
 
 /*
- * Contraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter:
  *
  *  == Non-pinned counter == (Considered as pinned for now)
  *
-- 
cgit 


From dfcb245e28481256a10a9133441baf2a93d26642 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 3 Dec 2018 10:05:56 +0100
Subject: sched: Fix various typos in comments

Go over the scheduler source code and fix common typos
in comments - and a typo in an actual variable name.

No change in functionality intended.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h           |  4 ++--
 include/linux/sched/isolation.h |  4 ++--
 include/linux/sched/mm.h        |  2 +-
 include/linux/sched/stat.h      |  2 +-
 kernel/sched/core.c             |  2 +-
 kernel/sched/cputime.c          |  2 +-
 kernel/sched/deadline.c         |  2 +-
 kernel/sched/fair.c             |  8 ++++----
 kernel/sched/isolation.c        | 14 +++++++-------
 kernel/sched/sched.h            |  4 ++--
 10 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 291a9bd5b97f..b8c7ba0e3796 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,7 +176,7 @@ struct task_group;
  * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
  *
  * However, with slightly different timing the wakeup TASK_RUNNING store can
- * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
+ * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
  * a problem either because that will result in one extra go around the loop
  * and our @cond test will save the day.
  *
@@ -515,7 +515,7 @@ struct sched_dl_entity {
 
 	/*
 	 * Actual scheduling parameters. Initialized with the values above,
-	 * they are continously updated during task execution. Note that
+	 * they are continuously updated during task execution. Note that
 	 * the remaining runtime could be < 0 in case we are in overrun.
 	 */
 	s64				runtime;	/* Remaining runtime for this instance	*/
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 4a6582c27dea..b0fb1446fe04 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -16,7 +16,7 @@ enum hk_flags {
 };
 
 #ifdef CONFIG_CPU_ISOLATION
-DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
+DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
 extern int housekeeping_any_cpu(enum hk_flags flags);
 extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
 extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
@@ -43,7 +43,7 @@ static inline void housekeeping_init(void) { }
 static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
 {
 #ifdef CONFIG_CPU_ISOLATION
-	if (static_branch_unlikely(&housekeeping_overriden))
+	if (static_branch_unlikely(&housekeeping_overridden))
 		return housekeeping_test_cpu(cpu, flags);
 #endif
 	return true;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index aebb370a0006..3bfa6a0cbba4 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -153,7 +153,7 @@ static inline gfp_t current_gfp_context(gfp_t flags)
 {
 	/*
 	 * NOIO implies both NOIO and NOFS and it is a weaker context
-	 * so always make sure it makes precendence
+	 * so always make sure it makes precedence
 	 */
 	if (unlikely(current->flags & PF_MEMALLOC_NOIO))
 		flags &= ~(__GFP_IO | __GFP_FS);
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index f30954cc059d..568286411b43 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -8,7 +8,7 @@
  * Various counters maintained by the scheduler and fork(),
  * exposed via /proc, sys.c or used by drivers via these APIs.
  *
- * ( Note that all these values are aquired without locking,
+ * ( Note that all these values are acquired without locking,
  *   so they can only be relied on in narrow circumstances. )
  */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8050f266751a..e4ca15d75541 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2857,7 +2857,7 @@ unsigned long nr_running(void)
  * preemption, thus the result might have a time-of-check-to-time-of-use
  * race.  The caller is responsible to use it correctly, for example:
  *
- * - from a non-preemptable section (of course)
+ * - from a non-preemptible section (of course)
  *
  * - from a thread that is bound to a single CPU
  *
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f938c4f0..ba4a143bdcf3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,7 +525,7 @@ void account_idle_ticks(unsigned long ticks)
 
 /*
  * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * loosing precision when the numbers are big.
+ * losing precision when the numbers are big.
  */
 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 470ba6b464fe..b32bc1f7cd14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -727,7 +727,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
  * refill the runtime and set the deadline a period in the future,
  * because keeping the current (absolute) deadline of the task would
  * result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more informations).
+ * Documentation/scheduler/sched-deadline.txt for more information).
  *
  * This function returns true if:
  *
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e30dea59d215..fdc8356ea742 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -703,9 +703,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 	memset(sa, 0, sizeof(*sa));
 
 	/*
-	 * Tasks are intialized with full load to be seen as heavy tasks until
+	 * Tasks are initialized with full load to be seen as heavy tasks until
 	 * they get a chance to stabilize to their real load level.
-	 * Group entities are intialized with zero load to reflect the fact that
+	 * Group entities are initialized with zero load to reflect the fact that
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
@@ -3976,8 +3976,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/*
 	 * When dequeuing a sched_entity, we must:
 	 *   - Update loads to have both entity and cfs_rq synced with now.
-	 *   - Substract its load from the cfs_rq->runnable_avg.
-	 *   - Substract its previous weight from cfs_rq->load.weight.
+	 *   - Subtract its load from the cfs_rq->runnable_avg.
+	 *   - Subtract its previous weight from cfs_rq->load.weight.
 	 *   - For group entity, update its weight to reflect the new share
 	 *     of its group cfs_rq.
 	 */
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index e6802181900f..81faddba9e20 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,14 +8,14 @@
  */
 #include "sched.h"
 
-DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
-EXPORT_SYMBOL_GPL(housekeeping_overriden);
+DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
+EXPORT_SYMBOL_GPL(housekeeping_overridden);
 static cpumask_var_t housekeeping_mask;
 static unsigned int housekeeping_flags;
 
 int housekeeping_any_cpu(enum hk_flags flags)
 {
-	if (static_branch_unlikely(&housekeeping_overriden))
+	if (static_branch_unlikely(&housekeeping_overridden))
 		if (housekeeping_flags & flags)
 			return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 	return smp_processor_id();
@@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
 const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 {
-	if (static_branch_unlikely(&housekeeping_overriden))
+	if (static_branch_unlikely(&housekeeping_overridden))
 		if (housekeeping_flags & flags)
 			return housekeeping_mask;
 	return cpu_possible_mask;
@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
 void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
 {
-	if (static_branch_unlikely(&housekeeping_overriden))
+	if (static_branch_unlikely(&housekeeping_overridden))
 		if (housekeeping_flags & flags)
 			set_cpus_allowed_ptr(t, housekeeping_mask);
 }
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine);
 
 bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
 {
-	if (static_branch_unlikely(&housekeeping_overriden))
+	if (static_branch_unlikely(&housekeeping_overridden))
 		if (housekeeping_flags & flags)
 			return cpumask_test_cpu(cpu, housekeeping_mask);
 	return true;
@@ -53,7 +53,7 @@ void __init housekeeping_init(void)
 	if (!housekeeping_flags)
 		return;
 
-	static_branch_enable(&housekeeping_overriden);
+	static_branch_enable(&housekeeping_overridden);
 
 	if (housekeeping_flags & HK_FLAG_TICK)
 		sched_tick_offload_init();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71cd8b710599..9bde60a11805 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -637,7 +637,7 @@ struct dl_rq {
 	/*
 	 * Deadline values of the currently executing and the
 	 * earliest ready task on this rq. Caching these facilitates
-	 * the decision wether or not a ready but not running task
+	 * the decision whether or not a ready but not running task
 	 * should migrate somewhere else.
 	 */
 	struct {
@@ -1434,7 +1434,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #ifdef CONFIG_SMP
 	/*
 	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-	 * successfuly executed on another CPU. We must ensure that updates of
+	 * successfully executed on another CPU. We must ensure that updates of
 	 * per-task data have been completed by this moment.
 	 */
 	smp_wmb();
-- 
cgit 


From 1e7eacaf1db2f0f5f62fceda4e6c5a8869f00c13 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 1 Dec 2018 03:12:56 +0000
Subject: cpuset: Remove set but not used variable 'cs'

Fixes gcc '-Wunused-but-set-variable' warning:

kernel/cgroup/cpuset.c: In function 'cpuset_cancel_attach':
kernel/cgroup/cpuset.c:2167:17: warning:
 variable 'cs' set but not used [-Wunused-but-set-variable]

It never used since introduction in commit 1f7dd3e5a6e4 ("cgroup: fix handling
of multi-destination migration from subtree_control enabling")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1151e93d71b6..f0decd8165e7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2109,10 +2109,8 @@ out_unlock:
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
-	struct cpuset *cs;
 
 	cgroup_taskset_first(tset, &css);
-	cs = css_cs(css);
 
 	mutex_lock(&cpuset_mutex);
 	css_cs(css)->attach_in_progress--;
-- 
cgit 


From 9a547c7e575fc2501c12081558fda3027d0f2a5e Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 30 Nov 2018 16:13:16 -0500
Subject: audit: shorten PATH cap values when zero

Since the vast majority of files (99.993% on a typical system) have no
fcaps, display "0" instead of the full zero-padded 16 hex digits in the
two PATH record cap_f* fields to save netlink bandwidth and disk space.

Simply changing the format to %x won't work since the value is two (or
possibly more in the future) 32-bit hexadecimal values concatenated and
bits in higher order values will be misrepresented.

Passes audit-testsuite and userspace tools already work fine.
Please see the github issue tracker for more details
https://github.com/linux-audit/audit-kernel/issues/101

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 779671883349..a0a4544e69ca 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2059,11 +2059,13 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
 {
 	int i;
 
-	audit_log_format(ab, " %s=", prefix);
-	CAP_FOR_EACH_U32(i) {
-		audit_log_format(ab, "%08x",
-				 cap->cap[CAP_LAST_U32 - i]);
+	if (cap_isclear(*cap)) {
+		audit_log_format(ab, " %s=0", prefix);
+		return;
 	}
+	audit_log_format(ab, " %s=", prefix);
+	CAP_FOR_EACH_U32(i)
+		audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
 }
 
 static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-- 
cgit 


From ce10a5b3954f2514af726beb78ed8d7350c5e41c Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 28 Nov 2018 15:43:09 -0800
Subject: timekeeping: Use proper seqcount initializer

tk_core.seq is initialized open coded, but that misses to initialize the
lockdep map when lockdep is enabled. Lockdep splats involving tk_core seq
consequently lack a name and are hard to read.

Use the proper initializer which takes care of the lockdep map
initialization.

[ tglx: Massaged changelog ]

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: tj@kernel.org
Cc: johannes.berg@intel.com
Link: https://lkml.kernel.org/r/20181128234325.110011-12-bvanassche@acm.org
---
 kernel/time/timekeeping.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd02bd38cf2d..c801e25875a3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -45,7 +45,9 @@ enum timekeeping_adv_mode {
 static struct {
 	seqcount_t		seq;
 	struct timekeeper	timekeeper;
-} tk_core ____cacheline_aligned;
+} tk_core ____cacheline_aligned = {
+	.seq = SEQCNT_ZERO(tk_core.seq),
+};
 
 static DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
-- 
cgit 


From a1da439cc0d929891f0f7372c9e03530c809068c Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Wed, 5 Dec 2018 11:14:01 +0100
Subject: dma-mapping: fix lack of DMA address assignment in generic remap
 allocator

Commit bfd56cd60521 ("dma-mapping: support highmem in the generic remap
allocator") replaced dma_direct_alloc_pages() with __dma_direct_alloc_pages(),
which doesn't set dma_handle and zero allocated memory. Fix it by doing this
directly in the caller function.

Fixes: bfd56cd60521 ("dma-mapping: support highmem in the generic remap allocator")
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/remap.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 68a64e3ff6a1..8a44317cfc1a 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -223,8 +223,14 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
 			arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
 			__builtin_return_address(0));
-	if (!ret)
+	if (!ret) {
 		__dma_direct_free_pages(dev, size, page);
+		return ret;
+	}
+
+	*dma_handle = phys_to_dma(dev, page_to_phys(page));
+	memset(ret, 0, size);
+
 	return ret;
 }
 
-- 
cgit 


From dc002bb62f10c5905420f8b8a7d5ec0da567fc82 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 23 Nov 2018 23:18:03 +0100
Subject: bpf: add __weak hook for allocating executable memory

By default, BPF uses module_alloc() to allocate executable memory,
but this is not necessary on all arches and potentially undesirable
on some of them.

So break out the module_alloc() and module_memfree() calls into __weak
functions to allow them to be overridden in arch code.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/core.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f93ed667546f..86817ab204e8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -623,6 +623,16 @@ static void bpf_jit_uncharge_modmem(u32 pages)
 	atomic_long_sub(pages, &bpf_jit_current);
 }
 
+void *__weak bpf_jit_alloc_exec(unsigned long size)
+{
+	return module_alloc(size);
+}
+
+void __weak bpf_jit_free_exec(void *addr)
+{
+	module_memfree(addr);
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
@@ -640,7 +650,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 	if (bpf_jit_charge_modmem(pages))
 		return NULL;
-	hdr = module_alloc(size);
+	hdr = bpf_jit_alloc_exec(size);
 	if (!hdr) {
 		bpf_jit_uncharge_modmem(pages);
 		return NULL;
@@ -664,7 +674,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	u32 pages = hdr->pages;
 
-	module_memfree(hdr);
+	bpf_jit_free_exec(hdr);
 	bpf_jit_uncharge_modmem(pages);
 }
 
-- 
cgit 


From 7337224fc150b3b762190425399ac0e8dee380d1 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 5 Dec 2018 17:35:43 -0800
Subject: bpf: Improve the info.func_info and info.func_info_rec_size behavior

1) When bpf_dump_raw_ok() == false and the kernel can provide >=1
   func_info to the userspace, the current behavior is setting
   the info.func_info_cnt to 0 instead of setting info.func_info
   to 0.

   It is different from the behavior in jited_func_lens/nr_jited_func_lens,
   jited_ksyms/nr_jited_ksyms...etc.

   This patch fixes it. (i.e. set func_info to 0 instead of
   func_info_cnt to 0 when bpf_dump_raw_ok() == false).

2) When the userspace passed in info.func_info_cnt == 0, the kernel
   will set the expected func_info size back to the
   info.func_info_rec_size.  It is a way for the userspace to learn
   the kernel expected func_info_rec_size introduced in
   commit 838e96904ff3 ("bpf: Introduce bpf_func_info").

   An exception is the kernel expected size is not set when
   func_info is not available for a bpf_prog.  This makes the
   returned info.func_info_rec_size has different values
   depending on the returned value of info.func_info_cnt.

   This patch sets the kernel expected size to info.func_info_rec_size
   independent of the info.func_info_cnt.

3) The current logic only rejects invalid func_info_rec_size if
   func_info_cnt is non zero.  This patch also rejects invalid
   nonzero info.func_info_rec_size and not equal to the kernel
   expected size.

4) Set info.btf_id as long as prog->aux->btf != NULL.  That will
   setup the later copy_to_user() codes look the same as others
   which then easier to understand and maintain.

   prog->aux->btf is not NULL only if prog->aux->func_info_cnt > 0.

   Breaking up info.btf_id from prog->aux->func_info_cnt is needed
   for the later line info patch anyway.

   A similar change is made to bpf_get_prog_name().

Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c    |  2 +-
 kernel/bpf/syscall.c | 46 ++++++++++++++++++++--------------------------
 2 files changed, 21 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86817ab204e8..628b3970a49b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -410,7 +410,7 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 
 	/* prog->aux->name will be ignored if full btf name is available */
-	if (prog->aux->btf) {
+	if (prog->aux->func_info_cnt) {
 		type = btf_type_by_id(prog->aux->btf,
 				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4445d0d084d8..aa05aa38f4a8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2083,6 +2083,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				return -EFAULT;
 	}
 
+	if ((info.func_info_cnt || info.func_info_rec_size) &&
+	    info.func_info_rec_size != sizeof(struct bpf_func_info))
+		return -EINVAL;
+
+	info.func_info_rec_size = sizeof(struct bpf_func_info);
+
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
@@ -2226,35 +2232,23 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
-	if (prog->aux->btf) {
-		u32 krec_size = sizeof(struct bpf_func_info);
-		u32 ucnt, urec_size;
-
+	if (prog->aux->btf)
 		info.btf_id = btf_id(prog->aux->btf);
 
-		ucnt = info.func_info_cnt;
-		info.func_info_cnt = prog->aux->func_info_cnt;
-		urec_size = info.func_info_rec_size;
-		info.func_info_rec_size = krec_size;
-		if (ucnt) {
-			/* expect passed-in urec_size is what the kernel expects */
-			if (urec_size != info.func_info_rec_size)
-				return -EINVAL;
-
-			if (bpf_dump_raw_ok()) {
-				char __user *user_finfo;
-
-				user_finfo = u64_to_user_ptr(info.func_info);
-				ucnt = min_t(u32, info.func_info_cnt, ucnt);
-				if (copy_to_user(user_finfo, prog->aux->func_info,
-						 krec_size * ucnt))
-					return -EFAULT;
-			} else {
-				info.func_info_cnt = 0;
-			}
+	ulen = info.func_info_cnt;
+	info.func_info_cnt = prog->aux->func_info_cnt;
+	if (info.func_info_cnt && ulen) {
+		if (bpf_dump_raw_ok()) {
+			char __user *user_finfo;
+
+			user_finfo = u64_to_user_ptr(info.func_info);
+			ulen = min_t(u32, info.func_info_cnt, ulen);
+			if (copy_to_user(user_finfo, prog->aux->func_info,
+					 info.func_info_rec_size * ulen))
+				return -EFAULT;
+		} else {
+			info.func_info = 0;
 		}
-	} else {
-		info.func_info_cnt = 0;
 	}
 
 done:
-- 
cgit 


From d30d42e08c76cb9323ec6121190eb026b07f773b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 5 Dec 2018 17:35:44 -0800
Subject: bpf: Change insn_offset to insn_off in bpf_func_info

The later patch will introduce "struct bpf_line_info" which
has member "line_off" and "file_off" referring back to the
string section in btf.  The line_"off" and file_"off"
are more consistent to the naming convention in btf.h that
means "offset" (e.g. name_off in "struct btf_type").

The to-be-added "struct bpf_line_info" also has another
member, "insn_off" which is the same as the "insn_offset"
in "struct bpf_func_info".  Hence, this patch renames "insn_offset"
to "insn_off" for "struct bpf_func_info".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  2 +-
 kernel/bpf/verifier.c    | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8e1eeee2c5f..a84fd232d934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2991,7 +2991,7 @@ struct bpf_flow_keys {
 };
 
 struct bpf_func_info {
-	__u32	insn_offset;
+	__u32	insn_off;
 	__u32	type_id;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71988337ac14..7658c61c1a88 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4707,24 +4707,24 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}
 
-		/* check insn_offset */
+		/* check insn_off */
 		if (i == 0) {
-			if (krecord[i].insn_offset) {
+			if (krecord[i].insn_off) {
 				verbose(env,
-					"nonzero insn_offset %u for the first func info record",
-					krecord[i].insn_offset);
+					"nonzero insn_off %u for the first func info record",
+					krecord[i].insn_off);
 				ret = -EINVAL;
 				goto free_btf;
 			}
-		} else if (krecord[i].insn_offset <= prev_offset) {
+		} else if (krecord[i].insn_off <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
-				krecord[i].insn_offset, prev_offset);
+				krecord[i].insn_off, prev_offset);
 			ret = -EINVAL;
 			goto free_btf;
 		}
 
-		if (env->subprog_info[i].start != krecord[i].insn_offset) {
+		if (env->subprog_info[i].start != krecord[i].insn_off) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
 			goto free_btf;
@@ -4739,7 +4739,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			goto free_btf;
 		}
 
-		prev_offset = krecord[i].insn_offset;
+		prev_offset = krecord[i].insn_off;
 		urecord += urec_size;
 	}
 
@@ -4762,7 +4762,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 		return;
 
 	for (i = 0; i < env->subprog_cnt; i++)
-		env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start;
+		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
 /* check %cur's range satisfies %old's */
-- 
cgit 


From 92a98a2b9f64a8b3c200a7709ceae04d09c39451 Mon Sep 17 00:00:00 2001
From: AKASHI Takahiro <takahiro.akashi@linaro.org>
Date: Thu, 15 Nov 2018 14:52:41 +0900
Subject: kexec_file: make kexec_image_post_load_cleanup_default() global

Change this function from static to global so that arm64 can implement
its own arch_kimage_file_post_load_cleanup() later using
kexec_image_post_load_cleanup_default().

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/kexec.h | 1 +
 kernel/kexec_file.c   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e4e638fb505..49ab758f4d91 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -143,6 +143,7 @@ extern const struct kexec_file_ops * const kexec_file_loaders[];
 
 int kexec_image_probe_default(struct kimage *image, void *buf,
 			      unsigned long buf_len);
+int kexec_image_post_load_cleanup_default(struct kimage *image);
 
 /**
  * struct kexec_buf - parameters for finding a place for a buffer in memory
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 35cf0ad29718..9ce6672f4fa3 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -76,7 +76,7 @@ void * __weak arch_kexec_kernel_image_load(struct kimage *image)
 	return kexec_image_load_default(image);
 }
 
-static int kexec_image_post_load_cleanup_default(struct kimage *image)
+int kexec_image_post_load_cleanup_default(struct kimage *image)
 {
 	if (!image->fops || !image->fops->cleanup)
 		return 0;
-- 
cgit 


From b6664ba42f1424d2768b605dd60cecc4428d9364 Mon Sep 17 00:00:00 2001
From: AKASHI Takahiro <takahiro.akashi@linaro.org>
Date: Thu, 15 Nov 2018 14:52:42 +0900
Subject: s390, kexec_file: drop arch_kexec_mem_walk()

Since s390 already knows where to locate buffers, calling
arch_kexec_mem_walk() has no sense. So we can just drop it as kbuf->mem
indicates this while all other architectures sets it to 0 initially.

This change is a preparatory work for the next patch, where all the
variant memory walks, either on system resource or memblock, will be
put in one common place so that it will satisfy all the architectures'
need.

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Reviewed-by: Philipp Rudo <prudo@linux.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/s390/kernel/machine_kexec_file.c | 10 ----------
 include/linux/kexec.h                 |  8 ++++++++
 kernel/kexec_file.c                   |  4 ++++
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index f413f57f8d20..32023b4f9dc0 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -134,16 +134,6 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data,
 	return ret;
 }
 
-/*
- * The kernel is loaded to a fixed location. Turn off kexec_locate_mem_hole
- * and provide kbuf->mem by hand.
- */
-int arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			int (*func)(struct resource *, void *))
-{
-	return 1;
-}
-
 int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 				     Elf_Shdr *section,
 				     const Elf_Shdr *relsec,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 49ab758f4d91..f378cb786f1b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -145,6 +145,14 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
 			      unsigned long buf_len);
 int kexec_image_post_load_cleanup_default(struct kimage *image);
 
+/*
+ * If kexec_buf.mem is set to this value, kexec_locate_mem_hole()
+ * will try to allocate free memory. Arch may overwrite it.
+ */
+#ifndef KEXEC_BUF_MEM_UNKNOWN
+#define KEXEC_BUF_MEM_UNKNOWN 0
+#endif
+
 /**
  * struct kexec_buf - parameters for finding a place for a buffer in memory
  * @image:	kexec image in which memory to search.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9ce6672f4fa3..9e6529da12ed 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -532,6 +532,10 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 {
 	int ret;
 
+	/* Arch knows where to place */
+	if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
+		return 0;
+
 	ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
 
 	return ret == 1 ? 0 : -EADDRNOTAVAIL;
-- 
cgit 


From 735c2f90e333b3d0adee52a8e7e855a0c0eca284 Mon Sep 17 00:00:00 2001
From: AKASHI Takahiro <takahiro.akashi@linaro.org>
Date: Thu, 15 Nov 2018 14:52:43 +0900
Subject: powerpc, kexec_file: factor out memblock-based arch_kexec_walk_mem()

Memblock list is another source for usable system memory layout.
So move powerpc's arch_kexec_walk_mem() to common code so that other
memblock-based architectures, particularly arm64, can also utilise it.
A moved function is now renamed to kexec_walk_memblock() and integrated
into kexec_locate_mem_hole(), which will now be usable for all
architectures with no need for overriding arch_kexec_walk_mem().

With this change, arch_kexec_walk_mem() need no longer be a weak function,
and was now renamed to kexec_walk_resources().

Since powerpc doesn't support kdump in its kexec_file_load(), the current
kexec_walk_memblock() won't work for kdump either in this form, this will
be fixed in the next patch.

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Acked-by: James Morse <james.morse@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/powerpc/kernel/machine_kexec_file_64.c | 54 -------------------------
 include/linux/kexec.h                       |  2 -
 kernel/kexec_file.c                         | 61 +++++++++++++++++++++++++++--
 3 files changed, 57 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index c77e95e9b384..0d20c7ad40fa 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -24,7 +24,6 @@
 
 #include <linux/slab.h>
 #include <linux/kexec.h>
-#include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <asm/ima.h>
@@ -46,59 +45,6 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 	return kexec_image_probe_default(image, buf, buf_len);
 }
 
-/**
- * arch_kexec_walk_mem - call func(data) for each unreserved memory block
- * @kbuf:	Context info for the search. Also passed to @func.
- * @func:	Function to call for each memory block.
- *
- * This function is used by kexec_add_buffer and kexec_locate_mem_hole
- * to find unreserved memory to load kexec segments into.
- *
- * Return: The memory walk will stop when func returns a non-zero value
- * and that value will be returned. If all free regions are visited without
- * func returning non-zero, then zero will be returned.
- */
-int arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			int (*func)(struct resource *, void *))
-{
-	int ret = 0;
-	u64 i;
-	phys_addr_t mstart, mend;
-	struct resource res = { };
-
-	if (kbuf->top_down) {
-		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
-						&mstart, &mend, NULL) {
-			/*
-			 * In memblock, end points to the first byte after the
-			 * range while in kexec, end points to the last byte
-			 * in the range.
-			 */
-			res.start = mstart;
-			res.end = mend - 1;
-			ret = func(&res, kbuf);
-			if (ret)
-				break;
-		}
-	} else {
-		for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend,
-					NULL) {
-			/*
-			 * In memblock, end points to the first byte after the
-			 * range while in kexec, end points to the last byte
-			 * in the range.
-			 */
-			res.start = mstart;
-			res.end = mend - 1;
-			ret = func(&res, kbuf);
-			if (ret)
-				break;
-		}
-	}
-
-	return ret;
-}
-
 /**
  * setup_purgatory - initialize the purgatory's global variables
  * @image:		kexec image.
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f378cb786f1b..d58d1f2fab10 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -192,8 +192,6 @@ int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
 					const Elf_Shdr *relsec,
 					const Elf_Shdr *symtab);
 
-int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			       int (*func)(struct resource *, void *));
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9e6529da12ed..d03195a8cb6e 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/kexec.h>
+#include <linux/memblock.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -499,8 +500,57 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
 	return locate_mem_hole_bottom_up(start, end, kbuf);
 }
 
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+			       int (*func)(struct resource *, void *))
+{
+	return 0;
+}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+			       int (*func)(struct resource *, void *))
+{
+	int ret = 0;
+	u64 i;
+	phys_addr_t mstart, mend;
+	struct resource res = { };
+
+	if (kbuf->top_down) {
+		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
+						&mstart, &mend, NULL) {
+			/*
+			 * In memblock, end points to the first byte after the
+			 * range while in kexec, end points to the last byte
+			 * in the range.
+			 */
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
+			if (ret)
+				break;
+		}
+	} else {
+		for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend,
+					NULL) {
+			/*
+			 * In memblock, end points to the first byte after the
+			 * range while in kexec, end points to the last byte
+			 * in the range.
+			 */
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+#endif
+
 /**
- * arch_kexec_walk_mem - call func(data) on free memory regions
+ * kexec_walk_resources - call func(data) on free memory regions
  * @kbuf:	Context info for the search. Also passed to @func.
  * @func:	Function to call for each memory region.
  *
@@ -508,8 +558,8 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
  * and that value will be returned. If all free regions are visited without
  * func returning non-zero, then zero will be returned.
  */
-int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			       int (*func)(struct resource *, void *))
+static int kexec_walk_resources(struct kexec_buf *kbuf,
+				int (*func)(struct resource *, void *))
 {
 	if (kbuf->image->type == KEXEC_TYPE_CRASH)
 		return walk_iomem_res_desc(crashk_res.desc,
@@ -536,7 +586,10 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
 		return 0;
 
-	ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
+	if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+		ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
+	else
+		ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
 
 	return ret == 1 ? 0 : -EADDRNOTAVAIL;
 }
-- 
cgit 


From 497e1858647a0b859076dc96300527375977d76a Mon Sep 17 00:00:00 2001
From: AKASHI Takahiro <takahiro.akashi@linaro.org>
Date: Thu, 15 Nov 2018 14:52:44 +0900
Subject: kexec_file: kexec_walk_memblock() only walks a dedicated region at
 kdump

In kdump case, there exists only one dedicated memblock region as usable
memory (crashk_res). With this patch, kexec_walk_memblock() runs a given
callback function on this region.

Cosmetic change: 0 to MEMBLOCK_NONE at for_each_free_mem_range*()

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/kexec_file.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index d03195a8cb6e..f1d0e00a3971 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -515,8 +515,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
 	phys_addr_t mstart, mend;
 	struct resource res = { };
 
+	if (kbuf->image->type == KEXEC_TYPE_CRASH)
+		return func(&crashk_res, kbuf);
+
 	if (kbuf->top_down) {
-		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
+		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
 						&mstart, &mend, NULL) {
 			/*
 			 * In memblock, end points to the first byte after the
@@ -530,8 +533,8 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
 				break;
 		}
 	} else {
-		for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend,
-					NULL) {
+		for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+					&mstart, &mend, NULL) {
 			/*
 			 * In memblock, end points to the first byte after the
 			 * range while in kexec, end points to the last byte
-- 
cgit 


From b0cbeae4944924640bf550b75487729a20204c14 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 21 Nov 2018 18:52:35 +0100
Subject: dma-direct: remove the mapping_error dma_map_ops method

The dma-direct code already returns (~(dma_addr_t)0x0) on mapping
failures, so we can switch over to returning DMA_MAPPING_ERROR and let
the core dma-mapping code handle the rest.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/dma-swiotlb.c |  1 -
 include/linux/dma-direct.h        |  3 ---
 kernel/dma/direct.c               |  8 +-------
 kernel/dma/swiotlb.c              | 11 +++++------
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 5fc335f4d9cd..3d8df2cf8be9 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -59,7 +59,6 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
 	.sync_single_for_device = swiotlb_sync_single_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
 	.sync_sg_for_device = swiotlb_sync_sg_for_device,
-	.mapping_error = dma_direct_mapping_error,
 	.get_required_mask = swiotlb_powerpc_get_required,
 };
 
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 61b78f934f64..6e5a47ae7d64 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -5,8 +5,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/mem_encrypt.h>
 
-#define DIRECT_MAPPING_ERROR		(~(dma_addr_t)0)
-
 #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA
 #include <asm/dma-direct.h>
 #else
@@ -76,5 +74,4 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
 int dma_direct_supported(struct device *dev, u64 mask);
-int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c49849bcced6..308f88a750c8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -289,7 +289,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 	dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
 	if (!check_addr(dev, dma_addr, size, __func__))
-		return DIRECT_MAPPING_ERROR;
+		return DMA_MAPPING_ERROR;
 
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
@@ -336,11 +336,6 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	return mask >= phys_to_dma(dev, min_mask);
 }
 
-int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return dma_addr == DIRECT_MAPPING_ERROR;
-}
-
 const struct dma_map_ops dma_direct_ops = {
 	.alloc			= dma_direct_alloc,
 	.free			= dma_direct_free,
@@ -359,7 +354,6 @@ const struct dma_map_ops dma_direct_ops = {
 #endif
 	.get_required_mask	= dma_direct_get_required_mask,
 	.dma_supported		= dma_direct_supported,
-	.mapping_error		= dma_direct_mapping_error,
 	.cache_sync		= arch_dma_cache_sync,
 };
 EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 045930e32c0e..ff1ce81bb623 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -631,21 +631,21 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys,
 	if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) {
 		dev_warn_ratelimited(dev,
 			"Cannot do DMA to address %pa\n", phys);
-		return DIRECT_MAPPING_ERROR;
+		return DMA_MAPPING_ERROR;
 	}
 
 	/* Oh well, have to allocate and map a bounce buffer. */
 	*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
 			*phys, size, dir, attrs);
 	if (*phys == SWIOTLB_MAP_ERROR)
-		return DIRECT_MAPPING_ERROR;
+		return DMA_MAPPING_ERROR;
 
 	/* Ensure that the address returned is DMA'ble */
 	dma_addr = __phys_to_dma(dev, *phys);
 	if (unlikely(!dma_capable(dev, dma_addr, size))) {
 		swiotlb_tbl_unmap_single(dev, *phys, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC);
-		return DIRECT_MAPPING_ERROR;
+		return DMA_MAPPING_ERROR;
 	}
 
 	return dma_addr;
@@ -680,7 +680,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 
 	if (!dev_is_dma_coherent(dev) &&
 	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 &&
-	    dev_addr != DIRECT_MAPPING_ERROR)
+	    dev_addr != DMA_MAPPING_ERROR)
 		arch_sync_dma_for_device(dev, phys, size, dir);
 
 	return dev_addr;
@@ -789,7 +789,7 @@ swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems,
 	for_each_sg(sgl, sg, nelems, i) {
 		sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset,
 				sg->length, dir, attrs);
-		if (sg->dma_address == DIRECT_MAPPING_ERROR)
+		if (sg->dma_address == DMA_MAPPING_ERROR)
 			goto out_error;
 		sg_dma_len(sg) = sg->length;
 	}
@@ -869,7 +869,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 }
 
 const struct dma_map_ops swiotlb_dma_ops = {
-	.mapping_error		= dma_direct_mapping_error,
 	.alloc			= dma_direct_alloc,
 	.free			= dma_direct_free,
 	.sync_single_for_cpu	= swiotlb_sync_single_for_cpu,
-- 
cgit 


From df44b479654f62b478c18ee4d8bc4e9f897a9844 Mon Sep 17 00:00:00 2001
From: Peter Rajnoha <prajnoha@redhat.com>
Date: Wed, 5 Dec 2018 12:27:44 +0100
Subject: kobject: return error code if writing /sys/.../uevent fails

Propagate error code back to userspace if writing the /sys/.../uevent
file fails. Before, the write operation always returned with success,
even if we failed to recognize the input string or if we failed to
generate the uevent itself.

With the error codes properly propagated back to userspace, we are
able to react in userspace accordingly by not assuming and awaiting
a uevent that is not delivered.

Signed-off-by: Peter Rajnoha <prajnoha@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/bus.c  | 12 ++++++++----
 drivers/base/core.c |  8 +++++++-
 kernel/module.c     |  6 ++++--
 3 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 8bfd27ec73d6..b886b15cb53b 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -611,8 +611,10 @@ static void remove_probe_files(struct bus_type *bus)
 static ssize_t uevent_store(struct device_driver *drv, const char *buf,
 			    size_t count)
 {
-	kobject_synth_uevent(&drv->p->kobj, buf, count);
-	return count;
+	int rc;
+
+	rc = kobject_synth_uevent(&drv->p->kobj, buf, count);
+	return rc ? rc : count;
 }
 static DRIVER_ATTR_WO(uevent);
 
@@ -828,8 +830,10 @@ static void klist_devices_put(struct klist_node *n)
 static ssize_t bus_uevent_store(struct bus_type *bus,
 				const char *buf, size_t count)
 {
-	kobject_synth_uevent(&bus->p->subsys.kobj, buf, count);
-	return count;
+	int rc;
+
+	rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count);
+	return rc ? rc : count;
 }
 static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store);
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index e2285059161d..a4ced331bc50 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1073,8 +1073,14 @@ out:
 static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	if (kobject_synth_uevent(&dev->kobj, buf, count))
+	int rc;
+
+	rc = kobject_synth_uevent(&dev->kobj, buf, count);
+
+	if (rc) {
 		dev_err(dev, "uevent: failed to send synthetic uevent\n");
+		return rc;
+	}
 
 	return count;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 49a405891587..0812a7f80fa7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1207,8 +1207,10 @@ static ssize_t store_uevent(struct module_attribute *mattr,
 			    struct module_kobject *mk,
 			    const char *buffer, size_t count)
 {
-	kobject_synth_uevent(&mk->kobj, buffer, count);
-	return count;
+	int rc;
+
+	rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+	return rc ? rc : count;
 }
 
 struct module_attribute module_uevent =
-- 
cgit 


From ded653ccbec0335a78fa7a7aff3ec9870349fafb Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Wed, 19 Sep 2018 21:41:04 -0700
Subject: signal: Add set_user_sigmask()

Refactor reading sigset from userspace and updating sigmask
into an api.

This is useful for versions of syscalls that pass in the
sigmask and expect the current->sigmask to be changed during,
and restored after, the execution of the syscall.

With the advent of new y2038 syscalls in the subsequent patches,
we add two more new versions of the syscalls (for pselect, ppoll,
and io_pgetevents) in addition to the existing native and compat
versions. Adding such an api reduces the logic that would need to
be replicated otherwise.

Note that the calls to sigprocmask() ignored the return value
from the api as the function only returns an error on an invalid
first argument that is hardcoded at these call sites.
The updated logic uses set_current_blocked() instead.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/aio.c               | 23 +++++++----------------
 fs/eventpoll.c         | 22 ++++++----------------
 fs/select.c            | 50 ++++++++++++--------------------------------------
 include/linux/compat.h |  4 ++++
 include/linux/signal.h |  2 ++
 kernel/signal.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 76 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..6ddb63ee8eb6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2104,14 +2104,10 @@ SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	if (ksig.sigmask) {
-		if (ksig.sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+
+	ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 	if (signal_pending(current)) {
@@ -2174,14 +2170,9 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	if (ksig.sigmask) {
-		if (ksig.sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, ksig.sigmask))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 	if (signal_pending(current)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe6824b4b..2d86eeba837b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2223,14 +2223,9 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
-	}
+	error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (error)
+		return error;
 
 	error = do_epoll_wait(epfd, events, maxevents, timeout);
 
@@ -2266,14 +2261,9 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 	 * If the caller wants a certain signal mask to be set during the wait,
 	 * we apply it here.
 	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-		sigsaved = current->blocked;
-		set_current_blocked(&ksigmask);
-	}
+	err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (err)
+		return err;
 
 	err = do_epoll_wait(epfd, events, maxevents, timeout);
 
diff --git a/fs/select.c b/fs/select.c
index 22b3bf89f051..65c78b4147a2 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -717,16 +717,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		/* XXX: Don't preclude handling different sized sigset_t's.  */
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = core_sys_select(n, inp, outp, exp, to);
 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
@@ -1061,16 +1054,9 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		/* XXX: Don't preclude handling different sized sigset_t's.  */
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
@@ -1323,15 +1309,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
 	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
@@ -1389,15 +1369,9 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 			return -EINVAL;
 	}
 
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (get_compat_sigset(&ksigmask, sigmask))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
+	ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+	if (ret)
+		return ret;
 
 	ret = do_sys_poll(ufds, nfds, to);
 
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 88720b443cd6..17c497b82690 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -169,6 +169,10 @@ typedef struct {
 	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
 } compat_sigset_t;
 
+int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
+			    sigset_t *set, sigset_t *oldset,
+			    size_t sigsetsize);
+
 struct compat_sigaction {
 #ifndef __ARCH_HAS_IRIX_SIGACTION
 	compat_uptr_t			sa_handler;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index f428e86f4800..ce14b951befb 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,6 +273,8 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
 			       struct task_struct *p, enum pid_type type);
 extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
+extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
+	sigset_t *oldset, size_t sigsetsize);
 extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9a32bc2088c9..811b5d440617 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2735,6 +2735,51 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 }
 EXPORT_SYMBOL(sigprocmask);
 
+/*
+ * The api helps set app-provided sigmasks.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ */
+int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
+		     sigset_t *oldset, size_t sigsetsize)
+{
+	if (!usigmask)
+		return 0;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+	if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+		return -EFAULT;
+
+	*oldset = current->blocked;
+	set_current_blocked(set);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_user_sigmask);
+
+#ifdef CONFIG_COMPAT
+int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
+			    sigset_t *set, sigset_t *oldset,
+			    size_t sigsetsize)
+{
+	if (!usigmask)
+		return 0;
+
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+	if (get_compat_sigset(set, usigmask))
+		return -EFAULT;
+
+	*oldset = current->blocked;
+	set_current_blocked(set);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_compat_user_sigmask);
+#endif
+
 /**
  *  sys_rt_sigprocmask - change the list of currently blocked signals
  *  @how: whether to add, remove, or set signals
-- 
cgit 


From 854a6ed56839a40f6b5d02a2962f48841482eec4 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Wed, 19 Sep 2018 21:41:05 -0700
Subject: signal: Add restore_user_sigmask()

Refactor the logic to restore the sigmask before the syscall
returns into an api.
This is useful for versions of syscalls that pass in the
sigmask and expect the current->sigmask to be changed during
the execution and restored after the execution of the syscall.

With the advent of new y2038 syscalls in the subsequent patches,
we add two more new versions of the syscalls (for pselect, ppoll
and io_pgetevents) in addition to the existing native and compat
versions. Adding such an api reduces the logic that would need to
be replicated otherwise.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/aio.c               | 29 +++++-------------------
 fs/eventpoll.c         | 30 ++-----------------------
 fs/select.c            | 60 +++++++-------------------------------------------
 include/linux/signal.h |  2 ++
 kernel/signal.c        | 33 +++++++++++++++++++++++++++
 5 files changed, 51 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/fs/aio.c b/fs/aio.c
index 6ddb63ee8eb6..39a1f2df6805 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2110,18 +2110,9 @@ SYSCALL_DEFINE6(io_pgetevents,
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
-	if (signal_pending(current)) {
-		if (ksig.sigmask) {
-			current->saved_sigmask = sigsaved;
-			set_restore_sigmask();
-		}
-
-		if (!ret)
-			ret = -ERESTARTNOHAND;
-	} else {
-		if (ksig.sigmask)
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
+	restore_user_sigmask(ksig.sigmask, &sigsaved);
+	if (signal_pending(current) && !ret)
+		ret = -ERESTARTNOHAND;
 
 	return ret;
 }
@@ -2175,17 +2166,9 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 		return ret;
 
 	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
-	if (signal_pending(current)) {
-		if (ksig.sigmask) {
-			current->saved_sigmask = sigsaved;
-			set_restore_sigmask();
-		}
-		if (!ret)
-			ret = -ERESTARTNOHAND;
-	} else {
-		if (ksig.sigmask)
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
+	restore_user_sigmask(ksig.sigmask, &sigsaved);
+	if (signal_pending(current) && !ret)
+		ret = -ERESTARTNOHAND;
 
 	return ret;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2d86eeba837b..8a5a1010886b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2229,20 +2229,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 
 	error = do_epoll_wait(epfd, events, maxevents, timeout);
 
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (error == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			set_current_blocked(&sigsaved);
-	}
+	restore_user_sigmask(sigmask, &sigsaved);
 
 	return error;
 }
@@ -2267,20 +2254,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 
 	err = do_epoll_wait(epfd, events, maxevents, timeout);
 
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (err == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			set_current_blocked(&sigsaved);
-	}
+	restore_user_sigmask(sigmask, &sigsaved);
 
 	return err;
 }
diff --git a/fs/select.c b/fs/select.c
index 65c78b4147a2..eb9132520197 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -724,19 +724,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 	ret = core_sys_select(n, inp, outp, exp, to);
 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	restore_user_sigmask(sigmask, &sigsaved);
 
 	return ret;
 }
@@ -1060,21 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 
 	ret = do_sys_poll(ufds, nfds, to);
 
+	restore_user_sigmask(sigmask, &sigsaved);
+
 	/* We can restart this syscall, usually */
-	if (ret == -EINTR) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
+	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
@@ -1316,19 +1294,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	ret = compat_core_sys_select(n, inp, outp, exp, to);
 	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	restore_user_sigmask(sigmask, &sigsaved);
 
 	return ret;
 }
@@ -1375,21 +1341,11 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 
 	ret = do_sys_poll(ufds, nfds, to);
 
+	restore_user_sigmask(sigmask, &sigsaved);
+
 	/* We can restart this syscall, usually */
-	if (ret == -EINTR) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-				sizeof(sigsaved));
-			set_restore_sigmask();
-		}
+	if (ret == -EINTR)
 		ret = -ERESTARTNOHAND;
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index ce14b951befb..cc7e2c1cd444 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -275,6 +275,8 @@ extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struc
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
 	sigset_t *oldset, size_t sigsetsize);
+extern void restore_user_sigmask(const void __user *usigmask,
+				 sigset_t *sigsaved);
 extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index 811b5d440617..3c8ea7a328e0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2780,6 +2780,39 @@ int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
 EXPORT_SYMBOL(set_compat_user_sigmask);
 #endif
 
+/*
+ * restore_user_sigmask:
+ * usigmask: sigmask passed in from userland.
+ * sigsaved: saved sigmask when the syscall started and changed the sigmask to
+ *           usigmask.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
+ */
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+{
+
+	if (!usigmask)
+		return;
+	/*
+	 * When signals are pending, do not restore them here.
+	 * Restoring sigmask here can lead to delivering signals that the above
+	 * syscalls are intended to block because of the sigmask passed in.
+	 */
+	if (signal_pending(current)) {
+		current->saved_sigmask = *sigsaved;
+		set_restore_sigmask();
+		return;
+	}
+
+	/*
+	 * This is needed because the fast syscall return path does not restore
+	 * saved_sigmask when signals are not pending.
+	 */
+	set_current_blocked(sigsaved);
+}
+EXPORT_SYMBOL(restore_user_sigmask);
+
 /**
  *  sys_rt_sigprocmask - change the list of currently blocked signals
  *  @how: whether to add, remove, or set signals
-- 
cgit 


From 04e7712f4460585e5eed5b853fd8b82a9943958f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 17 Apr 2018 16:31:07 +0200
Subject: y2038: futex: Move compat implementation into futex.c

We are going to share the compat_sys_futex() handler between 64-bit
architectures and 32-bit architectures that need to deal with both 32-bit
and 64-bit time_t, and this is easier if both entry points are in the
same file.

In fact, most other system call handlers do the same thing these days, so
let's follow the trend here and merge all of futex_compat.c into futex.c.

In the process, a few minor changes have to be done to make sure everything
still makes sense: handle_futex_death() and futex_cmpxchg_enabled() become
local symbol, and the compat version of the fetch_robust_entry() function
gets renamed to compat_fetch_robust_entry() to avoid a symbol clash.

This is intended as a purely cosmetic patch, no behavior should
change.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/futex.h |   8 --
 kernel/Makefile       |   3 -
 kernel/futex.c        | 195 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/futex_compat.c | 202 --------------------------------------------------
 4 files changed, 192 insertions(+), 216 deletions(-)
 delete mode 100644 kernel/futex_compat.c

(limited to 'kernel')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 821ae502d3d8..ccaef0097785 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -9,9 +9,6 @@ struct inode;
 struct mm_struct;
 struct task_struct;
 
-extern int
-handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
-
 /*
  * Futexes are matched on equal values of this key.
  * The key type depends on whether it's a shared or private mapping.
@@ -55,11 +52,6 @@ extern void exit_robust_list(struct task_struct *curr);
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-#define futex_cmpxchg_enabled 1
-#else
-extern int futex_cmpxchg_enabled;
-#endif
 #else
 static inline void exit_robust_list(struct task_struct *curr)
 {
diff --git a/kernel/Makefile b/kernel/Makefile
index 7343b3a9bff0..8e40a6742d23 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,9 +49,6 @@ obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_FUTEX) += futex.o
-ifeq ($(CONFIG_COMPAT),y)
-obj-$(CONFIG_FUTEX) += futex_compat.o
-endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
diff --git a/kernel/futex.c b/kernel/futex.c
index f423f9b6577e..5cc7c3b098e9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#include <linux/compat.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fs.h>
@@ -173,8 +174,10 @@
  * double_lock_hb() and double_unlock_hb(), respectively.
  */
 
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 
 /*
@@ -3360,7 +3363,7 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
 	u32 uval, uninitialized_var(nval), mval;
 
@@ -3589,6 +3592,192 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+		   compat_uptr_t __user *head, unsigned int *pi)
+{
+	if (get_user(*uentry, head))
+		return -EFAULT;
+
+	*entry = compat_ptr((*uentry) & ~1);
+	*pi = (unsigned int)(*uentry) & 1;
+
+	return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+				compat_long_t futex_offset)
+{
+	compat_uptr_t base = ptr_to_compat(entry);
+	void __user *uaddr = compat_ptr(base + futex_offset);
+
+	return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+	struct compat_robust_list_head __user *head = curr->compat_robust_list;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	unsigned int uninitialized_var(next_pi);
+	compat_uptr_t uentry, next_uentry, upending;
+	compat_long_t futex_offset;
+	int rc;
+
+	if (!futex_cmpxchg_enabled)
+		return;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+		return;
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (compat_fetch_robust_entry(&upending, &pending,
+			       &head->list_op_pending, &pip))
+		return;
+
+	next_entry = NULL;	/* avoid warning with gcc */
+	while (entry != (struct robust_list __user *) &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+			(compat_uptr_t __user *)&entry->next, &next_pi);
+		/*
+		 * A pending lock might already be on the list, so
+		 * dont process it twice:
+		 */
+		if (entry != pending) {
+			void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+			if (handle_futex_death(uaddr, curr, pi))
+				return;
+		}
+		if (rc)
+			return;
+		uentry = next_uentry;
+		entry = next_entry;
+		pi = next_pi;
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+	if (pending) {
+		void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+		handle_futex_death(uaddr, curr, pip);
+	}
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+		struct compat_robust_list_head __user *, head,
+		compat_size_t, len)
+{
+	if (!futex_cmpxchg_enabled)
+		return -ENOSYS;
+
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->compat_robust_list = head;
+
+	return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+			compat_uptr_t __user *, head_ptr,
+			compat_size_t __user *, len_ptr)
+{
+	struct compat_robust_list_head __user *head;
+	unsigned long ret;
+	struct task_struct *p;
+
+	if (!futex_cmpxchg_enabled)
+		return -ENOSYS;
+
+	rcu_read_lock();
+
+	ret = -ESRCH;
+	if (!pid)
+		p = current;
+	else {
+		p = find_task_by_vpid(pid);
+		if (!p)
+			goto err_unlock;
+	}
+
+	ret = -EPERM;
+	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+		goto err_unlock;
+
+	head = p->compat_robust_list;
+	rcu_read_unlock();
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+		u32, val3)
+{
+	struct timespec ts;
+	ktime_t t, *tp = NULL;
+	int val2 = 0;
+	int cmd = op & FUTEX_CMD_MASK;
+
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+		      cmd == FUTEX_WAIT_BITSET ||
+		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
+		if (compat_get_timespec(&ts, utime))
+			return -EFAULT;
+		if (!timespec_valid(&ts))
+			return -EINVAL;
+
+		t = timespec_to_ktime(ts);
+		if (cmd == FUTEX_WAIT)
+			t = ktime_add_safe(ktime_get(), t);
+		tp = &t;
+	}
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+		val2 = (int) (unsigned long) utime;
+
+	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT */
+
 static void __init futex_detect_cmpxchg(void)
 {
 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
deleted file mode 100644
index 410a77a8f6e2..000000000000
--- a/kernel/futex_compat.c
+++ /dev/null
@@ -1,202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/kernel/futex_compat.c
- *
- * Futex compatibililty routines.
- *
- * Copyright 2006, Red Hat, Inc., Ingo Molnar
- */
-
-#include <linux/linkage.h>
-#include <linux/compat.h>
-#include <linux/nsproxy.h>
-#include <linux/futex.h>
-#include <linux/ptrace.h>
-#include <linux/syscalls.h>
-
-#include <linux/uaccess.h>
-
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t __user *head, unsigned int *pi)
-{
-	if (get_user(*uentry, head))
-		return -EFAULT;
-
-	*entry = compat_ptr((*uentry) & ~1);
-	*pi = (unsigned int)(*uentry) & 1;
-
-	return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
-				compat_long_t futex_offset)
-{
-	compat_uptr_t base = ptr_to_compat(entry);
-	void __user *uaddr = compat_ptr(base + futex_offset);
-
-	return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void compat_exit_robust_list(struct task_struct *curr)
-{
-	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
-	compat_uptr_t uentry, next_uentry, upending;
-	compat_long_t futex_offset;
-	int rc;
-
-	if (!futex_cmpxchg_enabled)
-		return;
-
-	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
-	 */
-	if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
-	 */
-	if (fetch_robust_entry(&upending, &pending,
-			       &head->list_op_pending, &pip))
-		return;
-
-	next_entry = NULL;	/* avoid warning with gcc */
-	while (entry != (struct robust_list __user *) &head->list) {
-		/*
-		 * Fetch the next entry in the list before calling
-		 * handle_futex_death:
-		 */
-		rc = fetch_robust_entry(&next_uentry, &next_entry,
-			(compat_uptr_t __user *)&entry->next, &next_pi);
-		/*
-		 * A pending lock might already be on the list, so
-		 * dont process it twice:
-		 */
-		if (entry != pending) {
-			void __user *uaddr = futex_uaddr(entry, futex_offset);
-
-			if (handle_futex_death(uaddr, curr, pi))
-				return;
-		}
-		if (rc)
-			return;
-		uentry = next_uentry;
-		entry = next_entry;
-		pi = next_pi;
-		/*
-		 * Avoid excessively long or circular lists:
-		 */
-		if (!--limit)
-			break;
-
-		cond_resched();
-	}
-	if (pending) {
-		void __user *uaddr = futex_uaddr(pending, futex_offset);
-
-		handle_futex_death(uaddr, curr, pip);
-	}
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
-		struct compat_robust_list_head __user *, head,
-		compat_size_t, len)
-{
-	if (!futex_cmpxchg_enabled)
-		return -ENOSYS;
-
-	if (unlikely(len != sizeof(*head)))
-		return -EINVAL;
-
-	current->compat_robust_list = head;
-
-	return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
-			compat_uptr_t __user *, head_ptr,
-			compat_size_t __user *, len_ptr)
-{
-	struct compat_robust_list_head __user *head;
-	unsigned long ret;
-	struct task_struct *p;
-
-	if (!futex_cmpxchg_enabled)
-		return -ENOSYS;
-
-	rcu_read_lock();
-
-	ret = -ESRCH;
-	if (!pid)
-		p = current;
-	else {
-		p = find_task_by_vpid(pid);
-		if (!p)
-			goto err_unlock;
-	}
-
-	ret = -EPERM;
-	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
-		goto err_unlock;
-
-	head = p->compat_robust_list;
-	rcu_read_unlock();
-
-	if (put_user(sizeof(*head), len_ptr))
-		return -EFAULT;
-	return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
-	rcu_read_unlock();
-
-	return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
-		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
-		u32, val3)
-{
-	struct timespec ts;
-	ktime_t t, *tp = NULL;
-	int val2 = 0;
-	int cmd = op & FUTEX_CMD_MASK;
-
-	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-		      cmd == FUTEX_WAIT_BITSET ||
-		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
-		if (compat_get_timespec(&ts, utime))
-			return -EFAULT;
-		if (!timespec_valid(&ts))
-			return -EINVAL;
-
-		t = timespec_to_ktime(ts);
-		if (cmd == FUTEX_WAIT)
-			t = ktime_add_safe(ktime_get(), t);
-		tp = &t;
-	}
-	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
-		val2 = (int) (unsigned long) utime;
-
-	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
-- 
cgit 


From bec2f7cbb73eadf5e1cc7d54ecb0980ede244257 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 17 Apr 2018 17:23:35 +0200
Subject: y2038: futex: Add support for __kernel_timespec

This prepares sys_futex for y2038 safe calling: the native
syscall is changed to receive a __kernel_timespec argument, which
will be switched to 64-bit time_t in the future. All the internal
time handling gets changed to timespec64, and the compat_sys_futex
entry point is moved under the CONFIG_COMPAT_32BIT_TIME check
to provide compatibility for existing 32-bit architectures.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/syscalls.h |  2 +-
 kernel/futex.c           | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a27cf407de92..247ad9eca955 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -553,7 +553,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags);
 
 /* kernel/futex.c */
 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
-			struct timespec __user *utime, u32 __user *uaddr2,
+			struct __kernel_timespec __user *utime, u32 __user *uaddr2,
 			u32 val3);
 asmlinkage long sys_get_robust_list(int pid,
 				    struct robust_list_head __user * __user *head_ptr,
diff --git a/kernel/futex.c b/kernel/futex.c
index 5cc7c3b098e9..b305beaab739 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3558,10 +3558,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 
 
 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
-		struct timespec __user *, utime, u32 __user *, uaddr2,
+		struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
 		u32, val3)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 	ktime_t t, *tp = NULL;
 	u32 val2 = 0;
 	int cmd = op & FUTEX_CMD_MASK;
@@ -3571,12 +3571,12 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
 		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
 			return -EFAULT;
-		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+		if (get_timespec64(&ts, utime))
 			return -EFAULT;
-		if (!timespec_valid(&ts))
+		if (!timespec64_valid(&ts))
 			return -EINVAL;
 
-		t = timespec_to_ktime(ts);
+		t = timespec64_to_ktime(ts);
 		if (cmd == FUTEX_WAIT)
 			t = ktime_add_safe(ktime_get(), t);
 		tp = &t;
@@ -3747,12 +3747,14 @@ err_unlock:
 
 	return ret;
 }
+#endif /* CONFIG_COMPAT */
 
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
 		u32, val3)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 	ktime_t t, *tp = NULL;
 	int val2 = 0;
 	int cmd = op & FUTEX_CMD_MASK;
@@ -3760,12 +3762,12 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
 		      cmd == FUTEX_WAIT_BITSET ||
 		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
-		if (compat_get_timespec(&ts, utime))
+		if (get_old_timespec32(&ts, utime))
 			return -EFAULT;
-		if (!timespec_valid(&ts))
+		if (!timespec64_valid(&ts))
 			return -EINVAL;
 
-		t = timespec_to_ktime(ts);
+		t = timespec64_to_ktime(ts);
 		if (cmd == FUTEX_WAIT)
 			t = ktime_add_safe(ktime_get(), t);
 		tp = &t;
@@ -3776,7 +3778,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_COMPAT_32BIT_TIME */
 
 static void __init futex_detect_cmpxchg(void)
 {
-- 
cgit 


From 2dc6b100f928aac8d7532bf7112d3f8d3f952bad Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Wed, 5 Dec 2018 13:52:34 -0500
Subject: bpf: interpreter support BPF_ALU | BPF_ARSH

This patch implements interpreting BPF_ALU | BPF_ARSH. Do arithmetic right
shift on low 32-bit sub-register, and zero the high 32 bits.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c | 52 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 628b3970a49b..a5b223ef7131 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -933,32 +933,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 #define BPF_INSN_MAP(INSN_2, INSN_3)		\
 	/* 32 bit ALU operations. */		\
 	/*   Register based. */			\
-	INSN_3(ALU, ADD, X),			\
-	INSN_3(ALU, SUB, X),			\
-	INSN_3(ALU, AND, X),			\
-	INSN_3(ALU, OR,  X),			\
-	INSN_3(ALU, LSH, X),			\
-	INSN_3(ALU, RSH, X),			\
-	INSN_3(ALU, XOR, X),			\
-	INSN_3(ALU, MUL, X),			\
-	INSN_3(ALU, MOV, X),			\
-	INSN_3(ALU, DIV, X),			\
-	INSN_3(ALU, MOD, X),			\
+	INSN_3(ALU, ADD,  X),			\
+	INSN_3(ALU, SUB,  X),			\
+	INSN_3(ALU, AND,  X),			\
+	INSN_3(ALU, OR,   X),			\
+	INSN_3(ALU, LSH,  X),			\
+	INSN_3(ALU, RSH,  X),			\
+	INSN_3(ALU, XOR,  X),			\
+	INSN_3(ALU, MUL,  X),			\
+	INSN_3(ALU, MOV,  X),			\
+	INSN_3(ALU, ARSH, X),			\
+	INSN_3(ALU, DIV,  X),			\
+	INSN_3(ALU, MOD,  X),			\
 	INSN_2(ALU, NEG),			\
 	INSN_3(ALU, END, TO_BE),		\
 	INSN_3(ALU, END, TO_LE),		\
 	/*   Immediate based. */		\
-	INSN_3(ALU, ADD, K),			\
-	INSN_3(ALU, SUB, K),			\
-	INSN_3(ALU, AND, K),			\
-	INSN_3(ALU, OR,  K),			\
-	INSN_3(ALU, LSH, K),			\
-	INSN_3(ALU, RSH, K),			\
-	INSN_3(ALU, XOR, K),			\
-	INSN_3(ALU, MUL, K),			\
-	INSN_3(ALU, MOV, K),			\
-	INSN_3(ALU, DIV, K),			\
-	INSN_3(ALU, MOD, K),			\
+	INSN_3(ALU, ADD,  K),			\
+	INSN_3(ALU, SUB,  K),			\
+	INSN_3(ALU, AND,  K),			\
+	INSN_3(ALU, OR,   K),			\
+	INSN_3(ALU, LSH,  K),			\
+	INSN_3(ALU, RSH,  K),			\
+	INSN_3(ALU, XOR,  K),			\
+	INSN_3(ALU, MUL,  K),			\
+	INSN_3(ALU, MOV,  K),			\
+	INSN_3(ALU, ARSH, K),			\
+	INSN_3(ALU, DIV,  K),			\
+	INSN_3(ALU, MOD,  K),			\
 	/* 64 bit ALU operations. */		\
 	/*   Register based. */			\
 	INSN_3(ALU64, ADD,  X),			\
@@ -1137,6 +1139,12 @@ select_insn:
 		DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
 		insn++;
 		CONT;
+	ALU_ARSH_X:
+		DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+		CONT;
+	ALU_ARSH_K:
+		DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+		CONT;
 	ALU64_ARSH_X:
 		(*(s64 *) &DST) >>= SRC;
 		CONT;
-- 
cgit 


From c49f7dbd4f9c2c49df7fc0f5b50c1350ee7e01ee Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Wed, 5 Dec 2018 13:52:35 -0500
Subject: bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH

This patch remove the rejection on BPF_ALU | BPF_ARSH as we have supported
them on interpreter and all JIT back-ends

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7658c61c1a88..2752d35ad073 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3649,11 +3649,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return -EINVAL;
 		}
 
-		if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
-			verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
-			return -EINVAL;
-		}
-
 		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
 		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
 			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
-- 
cgit 


From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:35 -0500
Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg

Prior patches ensured that any bio that interacts with a request_queue
is properly associated with a blkg. This makes bio->bi_css unnecessary
as blkg maintains a reference to blkcg already.

This removes the bio field bi_css and transfers corresponding uses to
access via bi_blkg.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 59 ++++++++++------------------------------------
 block/bounce.c             |  2 +-
 drivers/block/loop.c       |  5 ++--
 drivers/md/raid0.c         |  2 +-
 include/linux/bio.h        | 11 ++++-----
 include/linux/blk-cgroup.h |  8 +++----
 include/linux/blk_types.h  |  7 +++---
 kernel/trace/blktrace.c    |  4 ++--
 8 files changed, 32 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/block/bio.c b/block/bio.c
index b42477b6a225..2b6bc7b805ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -610,7 +610,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
 
-	bio_clone_blkcg_association(bio, bio_src);
+	bio_clone_blkg_association(bio, bio_src);
 	blkcg_bio_issue_init(bio);
 }
 EXPORT_SYMBOL(__bio_clone_fast);
@@ -1957,34 +1957,6 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
-/**
- * bio_associate_blkcg - associate a bio with the specified blkcg
- * @bio: target bio
- * @blkcg_css: css of the blkcg to associate
- *
- * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
- * treat @bio as if it were issued by a task which belongs to the blkcg.
- *
- * This function takes an extra reference of @blkcg_css which will be put
- * when @bio is released.  The caller must own @bio and is responsible for
- * synchronizing calls to this function.  If @blkcg_css is %NULL, a call to
- * blkcg_get_css() finds the current css from the kthread or task.
- */
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
-{
-	if (unlikely(bio->bi_css))
-		return -EBUSY;
-
-	if (blkcg_css)
-		css_get(blkcg_css);
-	else
-		blkcg_css = blkcg_get_css();
-
-	bio->bi_css = blkcg_css;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkcg);
-
 /**
  * bio_disassociate_blkg - puts back the blkg reference if associated
  * @bio: target bio
@@ -1994,6 +1966,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 void bio_disassociate_blkg(struct bio *bio)
 {
 	if (bio->bi_blkg) {
+		/* a ref is always taken on css */
+		css_put(&bio_blkcg(bio)->css);
 		blkg_put(bio->bi_blkg);
 		bio->bi_blkg = NULL;
 	}
@@ -2047,7 +2021,6 @@ void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css)
 {
 	css_get(css);
-	bio->bi_css = css;
 	__bio_associate_blkg_from_css(bio, css);
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
@@ -2066,13 +2039,10 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 {
 	struct cgroup_subsys_state *css;
 
-	if (unlikely(bio->bi_css))
-		return;
 	if (!page->mem_cgroup)
 		return;
 
 	css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
-	bio->bi_css = css;
 	__bio_associate_blkg_from_css(bio, css);
 }
 #endif /* CONFIG_MEMCG */
@@ -2094,8 +2064,10 @@ void bio_associate_blkg(struct bio *bio)
 
 	rcu_read_lock();
 
-	bio_associate_blkcg(bio, NULL);
-	blkcg = bio_blkcg(bio);
+	if (bio->bi_blkg)
+		blkcg = bio->bi_blkg->blkcg;
+	else
+		blkcg = css_to_blkcg(blkcg_get_css());
 
 	if (!blkcg->css.parent) {
 		__bio_associate_blkg(bio, q->root_blkg);
@@ -2115,27 +2087,22 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg);
  */
 void bio_disassociate_task(struct bio *bio)
 {
-	if (bio->bi_css) {
-		css_put(bio->bi_css);
-		bio->bi_css = NULL;
-	}
 	bio_disassociate_blkg(bio);
 }
 
 /**
- * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
  * @dst: destination bio
  * @src: source bio
  */
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
 {
-	if (src->bi_css)
-		WARN_ON(bio_associate_blkcg(dst, src->bi_css));
-
-	if (src->bi_blkg)
+	if (src->bi_blkg) {
+		css_get(&bio_blkcg(src)->css);
 		__bio_associate_blkg(dst, src->bi_blkg);
+	}
 }
-EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
 #endif /* CONFIG_BLK_CGROUP */
 
 static void __init biovec_init_slabs(void)
diff --git a/block/bounce.c b/block/bounce.c
index cfb96d5170d0..ffb9e9ecfa7e 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -277,7 +277,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 		}
 	}
 
-	bio_clone_blkcg_association(bio, bio_src);
+	bio_clone_blkg_association(bio, bio_src);
 	blkcg_bio_issue_init(bio);
 
 	return bio;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 176ab1f28eca..0770004616de 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,6 +77,7 @@
 #include <linux/falloc.h>
 #include <linux/uio.h>
 #include <linux/ioprio.h>
+#include <linux/blk-cgroup.h>
 
 #include "loop.h"
 
@@ -1820,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	/* always use the first bio's css */
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_css) {
-		cmd->css = rq->bio->bi_css;
+	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
+		cmd->css = &bio_blkcg(rq->bio)->css;
 		css_get(cmd->css);
 	} else
 #endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd2a09b..f3fb5bb8c82a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 		    !discard_bio)
 			continue;
 		bio_chain(discard_bio, bio);
-		bio_clone_blkcg_association(discard_bio, bio);
+		bio_clone_blkg_association(discard_bio, bio);
 		if (mddev->gendisk)
 			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
 				discard_bio, disk_devt(mddev->gendisk),
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f0438061a5a3..84e1c4dc703a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -498,7 +498,7 @@ do {						\
 do {						\
 	(dst)->bi_disk = (src)->bi_disk;	\
 	(dst)->bi_partno = (src)->bi_partno;	\
-	bio_clone_blkcg_association(dst, src);	\
+	bio_clone_blkg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
@@ -512,24 +512,21 @@ static inline void bio_associate_blkg_from_page(struct bio *bio,
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_disassociate_task(struct bio *bio);
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
+void bio_clone_blkg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
-static inline int bio_associate_blkcg(struct bio *bio,
-			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline void bio_disassociate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
 					       struct cgroup_subsys_state *css)
 { }
 static inline void bio_disassociate_task(struct bio *bio) { }
-static inline void bio_clone_blkcg_association(struct bio *dst,
-			struct bio *src) { }
+static inline void bio_clone_blkg_association(struct bio *dst,
+					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
 
 #ifdef CONFIG_HIGHMEM
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 8b069c3775ee..f11c37f8ce09 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -309,8 +309,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
  */
 static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return css_to_blkcg(blkcg_css());
 }
 
@@ -324,8 +324,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return NULL;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c0ba1a038ff3..46c005d601ac 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,10 +174,11 @@ struct bio {
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
-	 * Optional css associated with this bio.  Put on bio
-	 * release.  Read comment on top of bio_associate_current().
+	 * Represents the association of the css and request_queue for the bio.
+	 * If a bio goes direct to device, it will not have a blkg as it will
+	 * not have a request_queue associated with it.  The reference is put
+	 * on release of the bio.
 	 */
-	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2868d85f1fb1..fac0ddf8a8e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 		return NULL;
 
-	if (!bio->bi_css)
+	if (!bio->bi_blkg)
 		return NULL;
-	return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+	return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
 }
 #else
 static union kernfs_node_id *
-- 
cgit 


From fc5a828bfad628c1092194f2814604943561c52d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:36 -0500
Subject: blkcg: remove additional reference to the css

The previous patch in this series removed carrying around a pointer to
the css in blkg. However, the blkg association logic still relied on
taking a reference on the css to ensure we wouldn't fail in getting a
reference for the blkg.

Here the implicit dependency on the css is removed. The association
continues to rely on the tryget logic walking up the blkg tree. This
streamlines the three ways that association can happen: normal, swap,
and writeback.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 66 ++++++++++++++++++++--------------------------
 include/linux/blk-cgroup.h | 41 ----------------------------
 include/linux/cgroup.h     |  2 ++
 kernel/cgroup/cgroup.c     | 48 ++++++++++++++++++++++++++-------
 4 files changed, 69 insertions(+), 88 deletions(-)

(limited to 'kernel')

diff --git a/block/bio.c b/block/bio.c
index 2b6bc7b805ec..ce1e512dca5a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1966,8 +1966,6 @@ EXPORT_SYMBOL(bioset_init_from_src);
 void bio_disassociate_blkg(struct bio *bio)
 {
 	if (bio->bi_blkg) {
-		/* a ref is always taken on css */
-		css_put(&bio_blkcg(bio)->css);
 		blkg_put(bio->bi_blkg);
 		bio->bi_blkg = NULL;
 	}
@@ -1995,33 +1993,31 @@ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 	bio->bi_blkg = blkg_try_get_closest(blkg);
 }
 
-static void __bio_associate_blkg_from_css(struct bio *bio,
-					  struct cgroup_subsys_state *css)
-{
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-
-	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
-	__bio_associate_blkg(bio, blkg);
-
-	rcu_read_unlock();
-}
-
 /**
  * bio_associate_blkg_from_css - associate a bio with a specified css
  * @bio: target bio
  * @css: target css
  *
  * Associate @bio with the blkg found by combining the css's blkg and the
- * request_queue of the @bio.  This takes a reference on the css that will
- * be put upon freeing of @bio.
+ * request_queue of the @bio.  This falls back to the queue's root_blkg if
+ * the association fails with the css.
  */
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css)
 {
-	css_get(css);
-	__bio_associate_blkg_from_css(bio, css);
+	struct request_queue *q = bio->bi_disk->queue;
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	if (!css || !css->parent)
+		blkg = q->root_blkg;
+	else
+		blkg = blkg_lookup_create(css_to_blkcg(css), q);
+
+	__bio_associate_blkg(bio, blkg);
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
 
@@ -2032,8 +2028,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
  * @page: the page to lookup the blkcg from
  *
  * Associate @bio with the blkg from @page's owning memcg and the respective
- * request_queue.  This works like every other associate function wrt
- * references.
+ * request_queue.  If cgroup_e_css returns %NULL, fall back to the queue's
+ * root_blkg.
  */
 void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 {
@@ -2042,8 +2038,12 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 	if (!page->mem_cgroup)
 		return;
 
-	css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
-	__bio_associate_blkg_from_css(bio, css);
+	rcu_read_lock();
+
+	css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+	bio_associate_blkg_from_css(bio, css);
+
+	rcu_read_unlock();
 }
 #endif /* CONFIG_MEMCG */
 
@@ -2058,24 +2058,16 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
  */
 void bio_associate_blkg(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
-	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
+	struct cgroup_subsys_state *css;
 
 	rcu_read_lock();
 
 	if (bio->bi_blkg)
-		blkcg = bio->bi_blkg->blkcg;
+		css = &bio_blkcg(bio)->css;
 	else
-		blkcg = css_to_blkcg(blkcg_get_css());
+		css = blkcg_css();
 
-	if (!blkcg->css.parent) {
-		__bio_associate_blkg(bio, q->root_blkg);
-	} else {
-		blkg = blkg_lookup_create(blkcg, q);
-
-		__bio_associate_blkg(bio, blkg);
-	}
+	bio_associate_blkg_from_css(bio, css);
 
 	rcu_read_unlock();
 }
@@ -2097,10 +2089,8 @@ void bio_disassociate_task(struct bio *bio)
  */
 void bio_clone_blkg_association(struct bio *dst, struct bio *src)
 {
-	if (src->bi_blkg) {
-		css_get(&bio_blkcg(src)->css);
+	if (src->bi_blkg)
 		__bio_associate_blkg(dst, src->bi_blkg);
-	}
 }
 EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
 #endif /* CONFIG_BLK_CGROUP */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f11c37f8ce09..284819a4d122 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -247,47 +247,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void)
 	return task_css(current, io_cgrp_id);
 }
 
-/**
- * blkcg_get_css - find and get a reference to the css
- *
- * Find the css associated with either the kthread or the current task.
- * This takes a reference on the blkcg which will need to be managed by the
- * caller.
- */
-static inline struct cgroup_subsys_state *blkcg_get_css(void)
-{
-	struct cgroup_subsys_state *css;
-
-	rcu_read_lock();
-
-	css = kthread_blkcg();
-	if (css) {
-		css_get(css);
-	} else {
-		/*
-		 * This is a bit complicated.  It is possible task_css() is
-		 * seeing an old css pointer here.  This is caused by the
-		 * current thread migrating away from this cgroup and this
-		 * cgroup dying.  css_tryget() will fail when trying to take a
-		 * ref on a cgroup that's ref count has hit 0.
-		 *
-		 * Therefore, if it does fail, this means current must have
-		 * been swapped away already and this is waiting for it to
-		 * propagate on the polling cpu.  Hence the use of cpu_relax().
-		 */
-		while (true) {
-			css = task_css(current, io_cgrp_id);
-			if (likely(css_tryget(css)))
-				break;
-			cpu_relax();
-		}
-	}
-
-	rcu_read_unlock();
-
-	return css;
-}
-
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9d12757a65b0..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -93,6 +93,8 @@ extern struct css_set init_css_set;
 
 bool css_has_online_children(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
+					 struct cgroup_subsys *ss);
 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
 					     struct cgroup_subsys *ss);
 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..8b79318810ad 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
 }
 
 /**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
  *
@@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
  * function is guaranteed to return non-NULL css.
  */
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
-						struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+							struct cgroup_subsys *ss)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
@@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 	return cgroup_css(cgrp, ss);
 }
 
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss.  The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+					 struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	do {
+		css = cgroup_css(cgrp, ss);
+
+		if (css)
+			return css;
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+
+	return init_css_set.subsys[ss->id];
+}
+
 /**
  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -605,10 +634,11 @@ EXPORT_SYMBOL_GPL(of_css);
  *
  * Should be called under cgroup_[tree_]mutex.
  */
-#define for_each_e_css(css, ssid, cgrp)					\
-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
-		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
-			;						\
+#define for_each_e_css(css, ssid, cgrp)					    \
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
+		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
+						   cgroup_subsys[(ssid)]))) \
+			;						    \
 		else
 
 /**
@@ -1007,7 +1037,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 			 * @ss is in this hierarchy, so we want the
 			 * effective css from @cgrp.
 			 */
-			template[i] = cgroup_e_css(cgrp, ss);
+			template[i] = cgroup_e_css_by_mask(cgrp, ss);
 		} else {
 			/*
 			 * @ss is not in this hierarchy, so we don't want
@@ -3024,7 +3054,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
 		return ret;
 
 	/*
-	 * At this point, cgroup_e_css() results reflect the new csses
+	 * At this point, cgroup_e_css_by_mask() results reflect the new csses
 	 * making the following cgroup_update_dfl_csses() properly update
 	 * css associations of all tasks in the subtree.
 	 */
-- 
cgit 


From 761efe8a94cfcd0a3dd90f2008411550f3520b63 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 18 Nov 2018 18:44:04 -0500
Subject: function_graph: Remove the use of FTRACE_NOTRACE_DEPTH

The curr_ret_stack is no longer set to a negative value when a function is
not to be traced by the function graph tracer. Remove the usage of
FTRACE_NOTRACE_DEPTH, as it is no longer needed.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h               |  1 -
 kernel/trace/fgraph.c                | 19 -------------------
 kernel/trace/trace_functions_graph.c | 11 -----------
 3 files changed, 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 10bd46434908..98625f10d982 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -790,7 +790,6 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
  */
 #define __notrace_funcgraph		notrace
 
-#define FTRACE_NOTRACE_DEPTH 65536
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index e852b69c0e64..de887a983ac7 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -112,16 +112,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 
 	index = current->curr_ret_stack;
 
-	/*
-	 * A negative index here means that it's just returned from a
-	 * notrace'd function.  Recover index to get an original
-	 * return address.  See ftrace_push_return_trace().
-	 *
-	 * TODO: Need to check whether the stack gets corrupted.
-	 */
-	if (index < 0)
-		index += FTRACE_NOTRACE_DEPTH;
-
 	if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
 		ftrace_graph_stop();
 		WARN_ON(1);
@@ -190,15 +180,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	 */
 	barrier();
 	current->curr_ret_stack--;
-	/*
-	 * The curr_ret_stack can be less than -1 only if it was
-	 * filtered out and it's about to return from the function.
-	 * Recover the index and continue to trace normal functions.
-	 */
-	if (current->curr_ret_stack < -1) {
-		current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
-		return ret;
-	}
 
 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ecf543df943b..eaf9b1629956 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -115,9 +115,6 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 	if (ret != (unsigned long)return_to_handler)
 		return ret;
 
-	if (index < -1)
-		index += FTRACE_NOTRACE_DEPTH;
-
 	if (index < 0)
 		return ret;
 
@@ -675,10 +672,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 
 		cpu_data = per_cpu_ptr(data->cpu_data, cpu);
 
-		/* If a graph tracer ignored set_graph_notrace */
-		if (call->depth < -1)
-			call->depth += FTRACE_NOTRACE_DEPTH;
-
 		/*
 		 * Comments display at + 1 to depth. Since
 		 * this is a leaf function, keep the comments
@@ -721,10 +714,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
 		struct fgraph_cpu_data *cpu_data;
 		int cpu = iter->cpu;
 
-		/* If a graph tracer ignored set_graph_notrace */
-		if (call->depth < -1)
-			call->depth += FTRACE_NOTRACE_DEPTH;
-
 		cpu_data = per_cpu_ptr(data->cpu_data, cpu);
 		cpu_data->depth = call->depth;
 
-- 
cgit 


From 3306fc4aff464f9c08c8899695a218f4b1125d4a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 15 Nov 2018 12:32:38 -0500
Subject: ftrace: Create new ftrace_internal.h header

In order to move function graph infrastructure into its own file (fgraph.h)
it needs to access various functions and variables in ftrace.c that are
currently static. Create a new file called ftrace-internal.h that holds the
function prototypes and the extern declarations of the variables needed by
fgraph.c as well, and make them global in ftrace.c such that they can be
used outside that file.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c          | 76 ++++++++----------------------------------
 kernel/trace/ftrace_internal.h | 75 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 62 deletions(-)
 create mode 100644 kernel/trace/ftrace_internal.h

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 77734451cb05..52c89428b0db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -40,6 +40,7 @@
 #include <asm/sections.h>
 #include <asm/setup.h>
 
+#include "ftrace_internal.h"
 #include "trace_output.h"
 #include "trace_stat.h"
 
@@ -77,7 +78,7 @@
 #define ASSIGN_OPS_HASH(opsname, val)
 #endif
 
-static struct ftrace_ops ftrace_list_end __read_mostly = {
+struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
 	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
 	INIT_OPS_HASH(ftrace_list_end)
@@ -112,11 +113,11 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops);
  */
 static int ftrace_disabled __read_mostly;
 
-static DEFINE_MUTEX(ftrace_lock);
+DEFINE_MUTEX(ftrace_lock);
 
-static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-static struct ftrace_ops global_ops;
+struct ftrace_ops global_ops;
 
 #if ARCH_SUPPORTS_FTRACE_OPS
 static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -127,26 +128,6 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
 #endif
 
-/*
- * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
-#define do_for_each_ftrace_op(op, list)			\
-	op = rcu_dereference_raw_notrace(list);			\
-	do
-
-/*
- * Optimized for just a single item in the list (as that is the normal case).
- */
-#define while_for_each_ftrace_op(op)				\
-	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\
-	       unlikely((op) != &ftrace_list_end))
-
 static inline void ftrace_ops_init(struct ftrace_ops *ops)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -187,17 +168,11 @@ static void ftrace_sync_ipi(void *data)
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void update_function_graph_func(void);
-
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 static bool fgraph_graph_time = true;
-
-#else
-static inline void update_function_graph_func(void) { }
 #endif
 
-
 static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
 {
 	/*
@@ -334,7 +309,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops);
 
-static int __register_ftrace_function(struct ftrace_ops *ops)
+int __register_ftrace_function(struct ftrace_ops *ops)
 {
 	if (ops->flags & FTRACE_OPS_FL_DELETED)
 		return -EINVAL;
@@ -375,7 +350,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	return 0;
 }
 
-static int __unregister_ftrace_function(struct ftrace_ops *ops)
+int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
@@ -1022,9 +997,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
 #endif /* CONFIG_FUNCTION_PROFILER */
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int ftrace_graph_active;
-#else
-# define ftrace_graph_active 0
+int ftrace_graph_active;
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -1067,7 +1040,7 @@ static const struct ftrace_hash empty_hash = {
 };
 #define EMPTY_HASH	((struct ftrace_hash *)&empty_hash)
 
-static struct ftrace_ops global_ops = {
+struct ftrace_ops global_ops = {
 	.func				= ftrace_stub,
 	.local_hash.notrace_hash	= EMPTY_HASH,
 	.local_hash.filter_hash		= EMPTY_HASH,
@@ -1503,7 +1476,7 @@ static bool hash_contains_ip(unsigned long ip,
  * This needs to be called with preemption disabled as
  * the hashes are freed with call_rcu_sched().
  */
-static int
+int
 ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
 {
 	struct ftrace_ops_hash hash;
@@ -2682,7 +2655,7 @@ static void ftrace_startup_all(int command)
 	update_all_ops = false;
 }
 
-static int ftrace_startup(struct ftrace_ops *ops, int command)
+int ftrace_startup(struct ftrace_ops *ops, int command)
 {
 	int ret;
 
@@ -2724,7 +2697,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
 	return 0;
 }
 
-static int ftrace_shutdown(struct ftrace_ops *ops, int command)
+int ftrace_shutdown(struct ftrace_ops *ops, int command)
 {
 	int ret;
 
@@ -6177,7 +6150,7 @@ void ftrace_init_trace_array(struct trace_array *tr)
 }
 #else
 
-static struct ftrace_ops global_ops = {
+struct ftrace_ops global_ops = {
 	.func			= ftrace_stub,
 	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
 				  FTRACE_OPS_FL_INITIALIZED |
@@ -6194,31 +6167,10 @@ core_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 static inline void ftrace_startup_all(int command) { }
-/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command)					\
-	({								\
-		int ___ret = __register_ftrace_function(ops);		\
-		if (!___ret)						\
-			(ops)->flags |= FTRACE_OPS_FL_ENABLED;		\
-		___ret;							\
-	})
-# define ftrace_shutdown(ops, command)					\
-	({								\
-		int ___ret = __unregister_ftrace_function(ops);		\
-		if (!___ret)						\
-			(ops)->flags &= ~FTRACE_OPS_FL_ENABLED;		\
-		___ret;							\
-	})
 
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 
-static inline int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
-{
-	return 1;
-}
-
 static void ftrace_update_trampoline(struct ftrace_ops *ops)
 {
 }
@@ -6930,7 +6882,7 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
  * function against the global ops, and not just trace any function
  * that any ftrace_ops registered.
  */
-static void update_function_graph_func(void)
+void update_function_graph_func(void)
 {
 	struct ftrace_ops *op;
 	bool do_test = false;
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
new file mode 100644
index 000000000000..0515a2096f90
--- /dev/null
+++ b/kernel/trace/ftrace_internal.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H
+#define  _LINUX_KERNEL_FTRACE_INTERNAL_H
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+/*
+ * Traverse the ftrace_global_list, invoking all entries.  The reason that we
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list)			\
+	op = rcu_dereference_raw_notrace(list);			\
+	do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op)				\
+	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\
+	       unlikely((op) != &ftrace_list_end))
+
+extern struct ftrace_ops __rcu *ftrace_ops_list;
+extern struct ftrace_ops ftrace_list_end;
+extern struct mutex ftrace_lock;
+extern struct ftrace_ops global_ops;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+int ftrace_startup(struct ftrace_ops *ops, int command);
+int ftrace_shutdown(struct ftrace_ops *ops, int command);
+int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+
+#else /* !CONFIG_DYNAMIC_FTRACE */
+
+int __register_ftrace_function(struct ftrace_ops *ops);
+int __unregister_ftrace_function(struct ftrace_ops *ops);
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(ops, command)					\
+	({								\
+		int ___ret = __register_ftrace_function(ops);		\
+		if (!___ret)						\
+			(ops)->flags |= FTRACE_OPS_FL_ENABLED;		\
+		___ret;							\
+	})
+# define ftrace_shutdown(ops, command)					\
+	({								\
+		int ___ret = __unregister_ftrace_function(ops);		\
+		if (!___ret)						\
+			(ops)->flags &= ~FTRACE_OPS_FL_ENABLED;		\
+		___ret;							\
+	})
+static inline int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
+{
+	return 1;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern int ftrace_graph_active;
+void update_function_graph_func(void);
+#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
+# define ftrace_graph_active 0
+static inline void update_function_graph_func(void) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#else /* !CONFIG_FUNCTION_TRACER */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#endif
-- 
cgit 


From c8dd0f45874547e6e77bab03d71feb16c4cb98a8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 23 Nov 2018 13:06:07 -0500
Subject: function_graph: Do not expose the graph_time option when profiler is
 not configured

When the function profiler is not configured, the "graph_time" option is
meaningless, as the function profiler is the only thing that makes use of
it. Do not expose it if the profiler is not configured.

Link: http://lkml.kernel.org/r/20181123061133.GA195223@google.com

Reported-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.h                 | 5 +++++
 kernel/trace/trace_functions_graph.c | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f67060a75f38..ab16eca76e59 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -862,7 +862,12 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
 #define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
 
 extern void ftrace_graph_sleep_time_control(bool enable);
+
+#ifdef CONFIG_FUNCTION_PROFILER
 extern void ftrace_graph_graph_time_control(bool enable);
+#else
+static inline void ftrace_graph_graph_time_control(bool enable) { }
+#endif
 
 extern enum print_line_t
 print_graph_function_flags(struct trace_iterator *iter, u32 flags);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index eaf9b1629956..855c13c61e77 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -60,8 +60,12 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
 	/* Include sleep time (scheduled out) between entry and return */
 	{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
+
+#ifdef CONFIG_FUNCTION_PROFILER
 	/* Include time within nested functions */
 	{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+#endif
+
 	{ } /* Empty entry */
 };
 
-- 
cgit 


From e73e679f656e678b0e7f8961094201f3544f4541 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 15 Nov 2018 12:35:13 -0500
Subject: fgraph: Move function graph specific code into fgraph.c

To make the function graph infrastructure more managable, the code needs to
be in its own file (fgraph.c). Move the code that is specific for managing
the function graph infrastructure out of ftrace.c and into fgraph.c

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 360 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/ftrace.c | 368 +-------------------------------------------------
 2 files changed, 366 insertions(+), 362 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index de887a983ac7..374f3e42e29e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,11 +7,27 @@
  *
  * Highly modified by Steven Rostedt (VMware).
  */
+#include <linux/suspend.h>
 #include <linux/ftrace.h>
+#include <linux/slab.h>
 
-#include "trace.h"
+#include <trace/events/sched.h>
+
+#include "ftrace_internal.h"
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define ASSIGN_OPS_HASH(opsname, val) \
+	.func_hash		= val, \
+	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#else
+#define ASSIGN_OPS_HASH(opsname, val)
+#endif
 
 static bool kill_ftrace_graph;
+int ftrace_graph_active;
+
+/* Both enabled by default (can be cleared by function_graph tracer flags */
+static bool fgraph_sleep_time = true;
 
 /**
  * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
@@ -161,6 +177,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 	barrier();
 }
 
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+							void *unused)
+{
+	switch (state) {
+	case PM_HIBERNATION_PREPARE:
+		pause_graph_tracing();
+		break;
+
+	case PM_POST_HIBERNATION:
+		unpause_graph_tracing();
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ftrace_suspend_notifier = {
+	.notifier_call = ftrace_suspend_notifier_call,
+};
+
 /*
  * Send the trace to the ring-buffer.
  * @return the original return address.
@@ -190,3 +231,320 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 
 	return ret;
 }
+
+static struct ftrace_ops graph_ops = {
+	.func			= ftrace_stub,
+	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
+				   FTRACE_OPS_FL_INITIALIZED |
+				   FTRACE_OPS_FL_PID |
+				   FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
+	/* trampoline_size is only needed for dynamically allocated tramps */
+#endif
+	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
+
+void ftrace_graph_sleep_time_control(bool enable)
+{
+	fgraph_sleep_time = enable;
+}
+
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+	return 0;
+}
+
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+			(trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+	int i;
+	int ret = 0;
+	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+	struct task_struct *g, *t;
+
+	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+		ret_stack_list[i] =
+			kmalloc_array(FTRACE_RETFUNC_DEPTH,
+				      sizeof(struct ftrace_ret_stack),
+				      GFP_KERNEL);
+		if (!ret_stack_list[i]) {
+			start = 0;
+			end = i;
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, t) {
+		if (start == end) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+
+		if (t->ret_stack == NULL) {
+			atomic_set(&t->tracing_graph_pause, 0);
+			atomic_set(&t->trace_overrun, 0);
+			t->curr_ret_stack = -1;
+			t->curr_ret_depth = -1;
+			/* Make sure the tasks see the -1 first: */
+			smp_wmb();
+			t->ret_stack = ret_stack_list[start++];
+		}
+	} while_each_thread(g, t);
+
+unlock:
+	read_unlock(&tasklist_lock);
+free:
+	for (i = start; i < end; i++)
+		kfree(ret_stack_list[i]);
+	return ret;
+}
+
+static void
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
+			struct task_struct *prev, struct task_struct *next)
+{
+	unsigned long long timestamp;
+	int index;
+
+	/*
+	 * Does the user want to count the time a function was asleep.
+	 * If so, do not update the time stamps.
+	 */
+	if (fgraph_sleep_time)
+		return;
+
+	timestamp = trace_clock_local();
+
+	prev->ftrace_timestamp = timestamp;
+
+	/* only process tasks that we timestamped */
+	if (!next->ftrace_timestamp)
+		return;
+
+	/*
+	 * Update all the counters in next to make up for the
+	 * time next was sleeping.
+	 */
+	timestamp -= next->ftrace_timestamp;
+
+	for (index = next->curr_ret_stack; index >= 0; index--)
+		next->ret_stack[index].calltime += timestamp;
+}
+
+static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
+{
+	if (!ftrace_ops_test(&global_ops, trace->func, NULL))
+		return 0;
+	return __ftrace_graph_entry(trace);
+}
+
+/*
+ * The function graph tracer should only trace the functions defined
+ * by set_ftrace_filter and set_ftrace_notrace. If another function
+ * tracer ops is registered, the graph tracer requires testing the
+ * function against the global ops, and not just trace any function
+ * that any ftrace_ops registered.
+ */
+void update_function_graph_func(void)
+{
+	struct ftrace_ops *op;
+	bool do_test = false;
+
+	/*
+	 * The graph and global ops share the same set of functions
+	 * to test. If any other ops is on the list, then
+	 * the graph tracing needs to test if its the function
+	 * it should call.
+	 */
+	do_for_each_ftrace_op(op, ftrace_ops_list) {
+		if (op != &global_ops && op != &graph_ops &&
+		    op != &ftrace_list_end) {
+			do_test = true;
+			/* in double loop, break out with goto */
+			goto out;
+		}
+	} while_for_each_ftrace_op(op);
+ out:
+	if (do_test)
+		ftrace_graph_entry = ftrace_graph_entry_test;
+	else
+		ftrace_graph_entry = __ftrace_graph_entry;
+}
+
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+	atomic_set(&t->tracing_graph_pause, 0);
+	atomic_set(&t->trace_overrun, 0);
+	t->ftrace_timestamp = 0;
+	/* make curr_ret_stack visible before we add the ret_stack */
+	smp_wmb();
+	t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+	t->curr_ret_stack = -1;
+	t->curr_ret_depth = -1;
+	/*
+	 * The idle task has no parent, it either has its own
+	 * stack or no stack at all.
+	 */
+	if (t->ret_stack)
+		WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+	if (ftrace_graph_active) {
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = per_cpu(idle_ret_stack, cpu);
+		if (!ret_stack) {
+			ret_stack =
+				kmalloc_array(FTRACE_RETFUNC_DEPTH,
+					      sizeof(struct ftrace_ret_stack),
+					      GFP_KERNEL);
+			if (!ret_stack)
+				return;
+			per_cpu(idle_ret_stack, cpu) = ret_stack;
+		}
+		graph_init_task(t, ret_stack);
+	}
+}
+
+/* Allocate a return stack for newly created task */
+void ftrace_graph_init_task(struct task_struct *t)
+{
+	/* Make sure we do not use the parent ret_stack */
+	t->ret_stack = NULL;
+	t->curr_ret_stack = -1;
+	t->curr_ret_depth = -1;
+
+	if (ftrace_graph_active) {
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
+					  sizeof(struct ftrace_ret_stack),
+					  GFP_KERNEL);
+		if (!ret_stack)
+			return;
+		graph_init_task(t, ret_stack);
+	}
+}
+
+void ftrace_graph_exit_task(struct task_struct *t)
+{
+	struct ftrace_ret_stack	*ret_stack = t->ret_stack;
+
+	t->ret_stack = NULL;
+	/* NULL must become visible to IRQs before we free it: */
+	barrier();
+
+	kfree(ret_stack);
+}
+
+/* Allocate a return stack for each task */
+static int start_graph_tracing(void)
+{
+	struct ftrace_ret_stack **ret_stack_list;
+	int ret, cpu;
+
+	ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
+				       sizeof(struct ftrace_ret_stack *),
+				       GFP_KERNEL);
+
+	if (!ret_stack_list)
+		return -ENOMEM;
+
+	/* The cpu_boot init_task->ret_stack will never be freed */
+	for_each_online_cpu(cpu) {
+		if (!idle_task(cpu)->ret_stack)
+			ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+	}
+
+	do {
+		ret = alloc_retstack_tasklist(ret_stack_list);
+	} while (ret == -EAGAIN);
+
+	if (!ret) {
+		ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+		if (ret)
+			pr_info("ftrace_graph: Couldn't activate tracepoint"
+				" probe to kernel_sched_switch\n");
+	}
+
+	kfree(ret_stack_list);
+	return ret;
+}
+
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+			trace_func_graph_ent_t entryfunc)
+{
+	int ret = 0;
+
+	mutex_lock(&ftrace_lock);
+
+	/* we currently allow only one tracer registered at a time */
+	if (ftrace_graph_active) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	register_pm_notifier(&ftrace_suspend_notifier);
+
+	ftrace_graph_active++;
+	ret = start_graph_tracing();
+	if (ret) {
+		ftrace_graph_active--;
+		goto out;
+	}
+
+	ftrace_graph_return = retfunc;
+
+	/*
+	 * Update the indirect function to the entryfunc, and the
+	 * function that gets called to the entry_test first. Then
+	 * call the update fgraph entry function to determine if
+	 * the entryfunc should be called directly or not.
+	 */
+	__ftrace_graph_entry = entryfunc;
+	ftrace_graph_entry = ftrace_graph_entry_test;
+	update_function_graph_func();
+
+	ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
+out:
+	mutex_unlock(&ftrace_lock);
+	return ret;
+}
+
+void unregister_ftrace_graph(void)
+{
+	mutex_lock(&ftrace_lock);
+
+	if (unlikely(!ftrace_graph_active))
+		goto out;
+
+	ftrace_graph_active--;
+	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+	ftrace_graph_entry = ftrace_graph_entry_stub;
+	__ftrace_graph_entry = ftrace_graph_entry_stub;
+	ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
+	unregister_pm_notifier(&ftrace_suspend_notifier);
+	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+
+ out:
+	mutex_unlock(&ftrace_lock);
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 52c89428b0db..c53533b833cf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -19,7 +19,6 @@
 #include <linux/sched/task.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
-#include <linux/suspend.h>
 #include <linux/tracefs.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
@@ -167,12 +166,6 @@ static void ftrace_sync_ipi(void *data)
 	smp_rmb();
 }
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-/* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
-static bool fgraph_graph_time = true;
-#endif
-
 static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
 {
 	/*
@@ -790,6 +783,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static bool fgraph_graph_time = true;
+
+void ftrace_graph_graph_time_control(bool enable)
+{
+	fgraph_graph_time = enable;
+}
+
 static int profile_graph_entry(struct ftrace_graph_ent *trace)
 {
 	int index = current->curr_ret_stack;
@@ -996,10 +996,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
 }
 #endif /* CONFIG_FUNCTION_PROFILER */
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-int ftrace_graph_active;
-#endif
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 static struct ftrace_ops *removed_ops;
@@ -6697,353 +6693,3 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	mutex_unlock(&ftrace_lock);
 	return ret;
 }
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-static struct ftrace_ops graph_ops = {
-	.func			= ftrace_stub,
-	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
-				   FTRACE_OPS_FL_INITIALIZED |
-				   FTRACE_OPS_FL_PID |
-				   FTRACE_OPS_FL_STUB,
-#ifdef FTRACE_GRAPH_TRAMP_ADDR
-	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
-	/* trampoline_size is only needed for dynamically allocated tramps */
-#endif
-	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
-};
-
-void ftrace_graph_sleep_time_control(bool enable)
-{
-	fgraph_sleep_time = enable;
-}
-
-void ftrace_graph_graph_time_control(bool enable)
-{
-	fgraph_graph_time = enable;
-}
-
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
-{
-	return 0;
-}
-
-/* The callbacks that hook a function */
-trace_func_graph_ret_t ftrace_graph_return =
-			(trace_func_graph_ret_t)ftrace_stub;
-trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
-
-/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
-{
-	int i;
-	int ret = 0;
-	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
-	struct task_struct *g, *t;
-
-	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
-		ret_stack_list[i] =
-			kmalloc_array(FTRACE_RETFUNC_DEPTH,
-				      sizeof(struct ftrace_ret_stack),
-				      GFP_KERNEL);
-		if (!ret_stack_list[i]) {
-			start = 0;
-			end = i;
-			ret = -ENOMEM;
-			goto free;
-		}
-	}
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, t) {
-		if (start == end) {
-			ret = -EAGAIN;
-			goto unlock;
-		}
-
-		if (t->ret_stack == NULL) {
-			atomic_set(&t->tracing_graph_pause, 0);
-			atomic_set(&t->trace_overrun, 0);
-			t->curr_ret_stack = -1;
-			t->curr_ret_depth = -1;
-			/* Make sure the tasks see the -1 first: */
-			smp_wmb();
-			t->ret_stack = ret_stack_list[start++];
-		}
-	} while_each_thread(g, t);
-
-unlock:
-	read_unlock(&tasklist_lock);
-free:
-	for (i = start; i < end; i++)
-		kfree(ret_stack_list[i]);
-	return ret;
-}
-
-static void
-ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-			struct task_struct *prev, struct task_struct *next)
-{
-	unsigned long long timestamp;
-	int index;
-
-	/*
-	 * Does the user want to count the time a function was asleep.
-	 * If so, do not update the time stamps.
-	 */
-	if (fgraph_sleep_time)
-		return;
-
-	timestamp = trace_clock_local();
-
-	prev->ftrace_timestamp = timestamp;
-
-	/* only process tasks that we timestamped */
-	if (!next->ftrace_timestamp)
-		return;
-
-	/*
-	 * Update all the counters in next to make up for the
-	 * time next was sleeping.
-	 */
-	timestamp -= next->ftrace_timestamp;
-
-	for (index = next->curr_ret_stack; index >= 0; index--)
-		next->ret_stack[index].calltime += timestamp;
-}
-
-/* Allocate a return stack for each task */
-static int start_graph_tracing(void)
-{
-	struct ftrace_ret_stack **ret_stack_list;
-	int ret, cpu;
-
-	ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
-				       sizeof(struct ftrace_ret_stack *),
-				       GFP_KERNEL);
-
-	if (!ret_stack_list)
-		return -ENOMEM;
-
-	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu) {
-		if (!idle_task(cpu)->ret_stack)
-			ftrace_graph_init_idle_task(idle_task(cpu), cpu);
-	}
-
-	do {
-		ret = alloc_retstack_tasklist(ret_stack_list);
-	} while (ret == -EAGAIN);
-
-	if (!ret) {
-		ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-		if (ret)
-			pr_info("ftrace_graph: Couldn't activate tracepoint"
-				" probe to kernel_sched_switch\n");
-	}
-
-	kfree(ret_stack_list);
-	return ret;
-}
-
-/*
- * Hibernation protection.
- * The state of the current task is too much unstable during
- * suspend/restore to disk. We want to protect against that.
- */
-static int
-ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
-							void *unused)
-{
-	switch (state) {
-	case PM_HIBERNATION_PREPARE:
-		pause_graph_tracing();
-		break;
-
-	case PM_POST_HIBERNATION:
-		unpause_graph_tracing();
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
-	if (!ftrace_ops_test(&global_ops, trace->func, NULL))
-		return 0;
-	return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
-	struct ftrace_ops *op;
-	bool do_test = false;
-
-	/*
-	 * The graph and global ops share the same set of functions
-	 * to test. If any other ops is on the list, then
-	 * the graph tracing needs to test if its the function
-	 * it should call.
-	 */
-	do_for_each_ftrace_op(op, ftrace_ops_list) {
-		if (op != &global_ops && op != &graph_ops &&
-		    op != &ftrace_list_end) {
-			do_test = true;
-			/* in double loop, break out with goto */
-			goto out;
-		}
-	} while_for_each_ftrace_op(op);
- out:
-	if (do_test)
-		ftrace_graph_entry = ftrace_graph_entry_test;
-	else
-		ftrace_graph_entry = __ftrace_graph_entry;
-}
-
-static struct notifier_block ftrace_suspend_notifier = {
-	.notifier_call = ftrace_suspend_notifier_call,
-};
-
-int register_ftrace_graph(trace_func_graph_ret_t retfunc,
-			trace_func_graph_ent_t entryfunc)
-{
-	int ret = 0;
-
-	mutex_lock(&ftrace_lock);
-
-	/* we currently allow only one tracer registered at a time */
-	if (ftrace_graph_active) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	register_pm_notifier(&ftrace_suspend_notifier);
-
-	ftrace_graph_active++;
-	ret = start_graph_tracing();
-	if (ret) {
-		ftrace_graph_active--;
-		goto out;
-	}
-
-	ftrace_graph_return = retfunc;
-
-	/*
-	 * Update the indirect function to the entryfunc, and the
-	 * function that gets called to the entry_test first. Then
-	 * call the update fgraph entry function to determine if
-	 * the entryfunc should be called directly or not.
-	 */
-	__ftrace_graph_entry = entryfunc;
-	ftrace_graph_entry = ftrace_graph_entry_test;
-	update_function_graph_func();
-
-	ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-out:
-	mutex_unlock(&ftrace_lock);
-	return ret;
-}
-
-void unregister_ftrace_graph(void)
-{
-	mutex_lock(&ftrace_lock);
-
-	if (unlikely(!ftrace_graph_active))
-		goto out;
-
-	ftrace_graph_active--;
-	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
-	ftrace_graph_entry = ftrace_graph_entry_stub;
-	__ftrace_graph_entry = ftrace_graph_entry_stub;
-	ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
-	unregister_pm_notifier(&ftrace_suspend_notifier);
-	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-
- out:
-	mutex_unlock(&ftrace_lock);
-}
-
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
-
-static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
-{
-	atomic_set(&t->tracing_graph_pause, 0);
-	atomic_set(&t->trace_overrun, 0);
-	t->ftrace_timestamp = 0;
-	/* make curr_ret_stack visible before we add the ret_stack */
-	smp_wmb();
-	t->ret_stack = ret_stack;
-}
-
-/*
- * Allocate a return stack for the idle task. May be the first
- * time through, or it may be done by CPU hotplug online.
- */
-void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
-{
-	t->curr_ret_stack = -1;
-	t->curr_ret_depth = -1;
-	/*
-	 * The idle task has no parent, it either has its own
-	 * stack or no stack at all.
-	 */
-	if (t->ret_stack)
-		WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
-
-	if (ftrace_graph_active) {
-		struct ftrace_ret_stack *ret_stack;
-
-		ret_stack = per_cpu(idle_ret_stack, cpu);
-		if (!ret_stack) {
-			ret_stack =
-				kmalloc_array(FTRACE_RETFUNC_DEPTH,
-					      sizeof(struct ftrace_ret_stack),
-					      GFP_KERNEL);
-			if (!ret_stack)
-				return;
-			per_cpu(idle_ret_stack, cpu) = ret_stack;
-		}
-		graph_init_task(t, ret_stack);
-	}
-}
-
-/* Allocate a return stack for newly created task */
-void ftrace_graph_init_task(struct task_struct *t)
-{
-	/* Make sure we do not use the parent ret_stack */
-	t->ret_stack = NULL;
-	t->curr_ret_stack = -1;
-	t->curr_ret_depth = -1;
-
-	if (ftrace_graph_active) {
-		struct ftrace_ret_stack *ret_stack;
-
-		ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
-					  sizeof(struct ftrace_ret_stack),
-					  GFP_KERNEL);
-		if (!ret_stack)
-			return;
-		graph_init_task(t, ret_stack);
-	}
-}
-
-void ftrace_graph_exit_task(struct task_struct *t)
-{
-	struct ftrace_ret_stack	*ret_stack = t->ret_stack;
-
-	t->ret_stack = NULL;
-	/* NULL must become visible to IRQs before we free it: */
-	barrier();
-
-	kfree(ret_stack);
-}
-#endif
-- 
cgit 


From 317e04ca905ac6c4b33cb879e9a107c412125f14 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 28 Nov 2018 10:26:27 -0500
Subject: tracing: Rearrange functions in trace_sched_wakeup.c

Rearrange the functions in trace_sched_wakeup.c so that there are fewer
 #ifdef CONFIG_FUNCTION_TRACER and #ifdef CONFIG_FUNCTION_GRAPH_TRACER,
instead of having the #ifdefs spread all over.

No functional change is made.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_wakeup.c | 272 ++++++++++++++++++--------------------
 1 file changed, 130 insertions(+), 142 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7d04b9890755..2ce78100b4d3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -35,26 +35,19 @@ static arch_spinlock_t wakeup_lock =
 
 static void wakeup_reset(struct trace_array *tr);
 static void __wakeup_reset(struct trace_array *tr);
+static int start_func_tracer(struct trace_array *tr, int graph);
+static void stop_func_tracer(struct trace_array *tr, int graph);
 
 static int save_flags;
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int wakeup_display_graph(struct trace_array *tr, int set);
 # define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
 #else
-static inline int wakeup_display_graph(struct trace_array *tr, int set)
-{
-	return 0;
-}
 # define is_graph(tr) false
 #endif
 
-
 #ifdef CONFIG_FUNCTION_TRACER
 
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
-static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-
 static bool function_enabled;
 
 /*
@@ -104,122 +97,8 @@ out_enable:
 	return 0;
 }
 
-/*
- * wakeup uses its own tracer function to keep the overhead down:
- */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
-		   struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
-	struct trace_array *tr = wakeup_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	int pc;
-
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
-		return;
-
-	local_irq_save(flags);
-	trace_function(tr, ip, parent_ip, flags, pc);
-	local_irq_restore(flags);
-
-	atomic_dec(&data->disabled);
-	preempt_enable_notrace();
-}
-
-static int register_wakeup_function(struct trace_array *tr, int graph, int set)
-{
-	int ret;
-
-	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
-	if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
-		return 0;
-
-	if (graph)
-		ret = register_ftrace_graph(&wakeup_graph_return,
-					    &wakeup_graph_entry);
-	else
-		ret = register_ftrace_function(tr->ops);
-
-	if (!ret)
-		function_enabled = true;
-
-	return ret;
-}
-
-static void unregister_wakeup_function(struct trace_array *tr, int graph)
-{
-	if (!function_enabled)
-		return;
-
-	if (graph)
-		unregister_ftrace_graph();
-	else
-		unregister_ftrace_function(tr->ops);
-
-	function_enabled = false;
-}
-
-static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
-{
-	if (!(mask & TRACE_ITER_FUNCTION))
-		return 0;
-
-	if (set)
-		register_wakeup_function(tr, is_graph(tr), 1);
-	else
-		unregister_wakeup_function(tr, is_graph(tr));
-	return 1;
-}
-#else
-static int register_wakeup_function(struct trace_array *tr, int graph, int set)
-{
-	return 0;
-}
-static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
-static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
-{
-	return 0;
-}
-#endif /* CONFIG_FUNCTION_TRACER */
-
-static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
-{
-	struct tracer *tracer = tr->current_trace;
-
-	if (wakeup_function_set(tr, mask, set))
-		return 0;
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	if (mask & TRACE_ITER_DISPLAY_GRAPH)
-		return wakeup_display_graph(tr, set);
-#endif
-
-	return trace_keep_overwrite(tracer, mask, set);
-}
 
-static int start_func_tracer(struct trace_array *tr, int graph)
-{
-	int ret;
-
-	ret = register_wakeup_function(tr, graph, 0);
-
-	if (!ret && tracing_is_enabled())
-		tracer_enabled = 1;
-	else
-		tracer_enabled = 0;
-
-	return ret;
-}
-
-static void stop_func_tracer(struct trace_array *tr, int graph)
-{
-	tracer_enabled = 0;
-
-	unregister_wakeup_function(tr, graph);
-}
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int wakeup_display_graph(struct trace_array *tr, int set)
 {
 	if (!(is_graph(tr) ^ set))
@@ -318,20 +197,94 @@ static void wakeup_print_header(struct seq_file *s)
 	else
 		trace_default_header(s);
 }
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+	return -1;
+}
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+#endif /* else CONFIG_FUNCTION_GRAPH_TRACER */
 
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
 static void
-__trace_function(struct trace_array *tr,
-		 unsigned long ip, unsigned long parent_ip,
-		 unsigned long flags, int pc)
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+		   struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
-	if (is_graph(tr))
-		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc;
+
+	if (!func_prolog_preempt_disable(tr, &data, &pc))
+		return;
+
+	local_irq_save(flags);
+	trace_function(tr, ip, parent_ip, flags, pc);
+	local_irq_restore(flags);
+
+	atomic_dec(&data->disabled);
+	preempt_enable_notrace();
+}
+
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+	int ret;
+
+	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+	if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
+		return 0;
+
+	if (graph)
+		ret = register_ftrace_graph(&wakeup_graph_return,
+					    &wakeup_graph_entry);
 	else
-		trace_function(tr, ip, parent_ip, flags, pc);
+		ret = register_ftrace_function(tr->ops);
+
+	if (!ret)
+		function_enabled = true;
+
+	return ret;
 }
-#else
-#define __trace_function trace_function
 
+static void unregister_wakeup_function(struct trace_array *tr, int graph)
+{
+	if (!function_enabled)
+		return;
+
+	if (graph)
+		unregister_ftrace_graph();
+	else
+		unregister_ftrace_function(tr->ops);
+
+	function_enabled = false;
+}
+
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+	if (!(mask & TRACE_ITER_FUNCTION))
+		return 0;
+
+	if (set)
+		register_wakeup_function(tr, is_graph(tr), 1);
+	else
+		unregister_wakeup_function(tr, is_graph(tr));
+	return 1;
+}
+#else /* CONFIG_FUNCTION_TRACER */
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+	return 0;
+}
+static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+	return 0;
+}
+#endif /* else CONFIG_FUNCTION_TRACER */
+
+#ifndef CONFIG_FUNCTION_GRAPH_TRACER
 static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
@@ -340,23 +293,58 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
 static void wakeup_trace_open(struct trace_iterator *iter) { }
 static void wakeup_trace_close(struct trace_iterator *iter) { }
 
-#ifdef CONFIG_FUNCTION_TRACER
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
-{
-	return -1;
-}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
 static void wakeup_print_header(struct seq_file *s)
 {
 	trace_default_header(s);
 }
-#else
-static void wakeup_print_header(struct seq_file *s)
+#endif /* !CONFIG_FUNCTION_GRAPH_TRACER */
+
+static void
+__trace_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long parent_ip,
+		 unsigned long flags, int pc)
+{
+	if (is_graph(tr))
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
+		trace_function(tr, ip, parent_ip, flags, pc);
+}
+
+static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
 {
-	trace_latency_header(s);
+	struct tracer *tracer = tr->current_trace;
+
+	if (wakeup_function_set(tr, mask, set))
+		return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	if (mask & TRACE_ITER_DISPLAY_GRAPH)
+		return wakeup_display_graph(tr, set);
+#endif
+
+	return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(struct trace_array *tr, int graph)
+{
+	int ret;
+
+	ret = register_wakeup_function(tr, graph, 0);
+
+	if (!ret && tracing_is_enabled())
+		tracer_enabled = 1;
+	else
+		tracer_enabled = 0;
+
+	return ret;
+}
+
+static void stop_func_tracer(struct trace_array *tr, int graph)
+{
+	tracer_enabled = 0;
+
+	unregister_wakeup_function(tr, graph);
 }
-#endif /* CONFIG_FUNCTION_TRACER */
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
  * Should this new latency be reported/recorded?
-- 
cgit 


From 688f7089d8851b1a81106f0c0b9b29181b2f2dc8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 15 Nov 2018 14:06:47 -0500
Subject: fgraph: Add new fgraph_ops structure to enable function graph hooks

Currently the registering of function graph is to pass in a entry and return
function. We need to have a way to associate those functions together where
the entry can determine to run the return hook. Having a structure that
contains both functions will facilitate the process of converting the code
to be able to do such.

This is similar to the way function hooks are enabled (it passes in
ftrace_ops). Instead of passing in the functions to use, a single structure
is passed in to the registering function.

The unregister function is now passed in the fgraph_ops handle. When we
allow more than one callback to the function graph hooks, this will let the
system know which one to remove.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h               | 21 +++++++++++----------
 kernel/trace/fgraph.c                |  9 ++++-----
 kernel/trace/ftrace.c                | 10 +++++++---
 kernel/trace/trace_functions_graph.c | 21 ++++++++++++++++-----
 kernel/trace/trace_irqsoff.c         | 18 +++++++-----------
 kernel/trace/trace_sched_wakeup.c    | 16 +++++++---------
 kernel/trace/trace_selftest.c        |  8 ++++++--
 7 files changed, 58 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 98625f10d982..21c80491ccde 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -749,6 +749,11 @@ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+struct fgraph_ops {
+	trace_func_graph_ent_t		entryfunc;
+	trace_func_graph_ret_t		retfunc;
+};
+
 /*
  * Stack of return addresses for functions
  * of a thread.
@@ -792,8 +797,9 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
-extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
-				trace_func_graph_ent_t entryfunc);
+
+extern int register_ftrace_graph(struct fgraph_ops *ops);
+extern void unregister_ftrace_graph(struct fgraph_ops *ops);
 
 extern bool ftrace_graph_is_dead(void);
 extern void ftrace_graph_stop(void);
@@ -802,8 +808,6 @@ extern void ftrace_graph_stop(void);
 extern trace_func_graph_ret_t ftrace_graph_return;
 extern trace_func_graph_ent_t ftrace_graph_entry;
 
-extern void unregister_ftrace_graph(void);
-
 extern void ftrace_graph_init_task(struct task_struct *t);
 extern void ftrace_graph_exit_task(struct task_struct *t);
 extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu);
@@ -825,12 +829,9 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { }
 
-static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
-			  trace_func_graph_ent_t entryfunc)
-{
-	return -1;
-}
-static inline void unregister_ftrace_graph(void) { }
+/* Define as macros as fgraph_ops may not be defined */
+#define register_ftrace_graph(ops) ({ -1; })
+#define unregister_ftrace_graph(ops) do { } while (0)
 
 static inline unsigned long
 ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret,
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 374f3e42e29e..cc35606e9a3e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -490,8 +490,7 @@ static int start_graph_tracing(void)
 	return ret;
 }
 
-int register_ftrace_graph(trace_func_graph_ret_t retfunc,
-			trace_func_graph_ent_t entryfunc)
+int register_ftrace_graph(struct fgraph_ops *gops)
 {
 	int ret = 0;
 
@@ -512,7 +511,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 		goto out;
 	}
 
-	ftrace_graph_return = retfunc;
+	ftrace_graph_return = gops->retfunc;
 
 	/*
 	 * Update the indirect function to the entryfunc, and the
@@ -520,7 +519,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	 * call the update fgraph entry function to determine if
 	 * the entryfunc should be called directly or not.
 	 */
-	__ftrace_graph_entry = entryfunc;
+	__ftrace_graph_entry = gops->entryfunc;
 	ftrace_graph_entry = ftrace_graph_entry_test;
 	update_function_graph_func();
 
@@ -530,7 +529,7 @@ out:
 	return ret;
 }
 
-void unregister_ftrace_graph(void)
+void unregister_ftrace_graph(struct fgraph_ops *gops)
 {
 	mutex_lock(&ftrace_lock);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c53533b833cf..d06fe588e650 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -849,15 +849,19 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	local_irq_restore(flags);
 }
 
+static struct fgraph_ops fprofiler_ops = {
+	.entryfunc = &profile_graph_entry,
+	.retfunc = &profile_graph_return,
+};
+
 static int register_ftrace_profiler(void)
 {
-	return register_ftrace_graph(&profile_graph_return,
-				     &profile_graph_entry);
+	return register_ftrace_graph(&fprofiler_ops);
 }
 
 static void unregister_ftrace_profiler(void)
 {
-	unregister_ftrace_graph();
+	unregister_ftrace_graph(&fprofiler_ops);
 }
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 855c13c61e77..140b4b51ab34 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -345,17 +345,25 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
 		trace_graph_return(trace);
 }
 
+static struct fgraph_ops funcgraph_thresh_ops = {
+	.entryfunc = &trace_graph_entry,
+	.retfunc = &trace_graph_thresh_return,
+};
+
+static struct fgraph_ops funcgraph_ops = {
+	.entryfunc = &trace_graph_entry,
+	.retfunc = &trace_graph_return,
+};
+
 static int graph_trace_init(struct trace_array *tr)
 {
 	int ret;
 
 	set_graph_array(tr);
 	if (tracing_thresh)
-		ret = register_ftrace_graph(&trace_graph_thresh_return,
-					    &trace_graph_entry);
+		ret = register_ftrace_graph(&funcgraph_thresh_ops);
 	else
-		ret = register_ftrace_graph(&trace_graph_return,
-					    &trace_graph_entry);
+		ret = register_ftrace_graph(&funcgraph_ops);
 	if (ret)
 		return ret;
 	tracing_start_cmdline_record();
@@ -366,7 +374,10 @@ static int graph_trace_init(struct trace_array *tr)
 static void graph_trace_reset(struct trace_array *tr)
 {
 	tracing_stop_cmdline_record();
-	unregister_ftrace_graph();
+	if (tracing_thresh)
+		unregister_ftrace_graph(&funcgraph_thresh_ops);
+	else
+		unregister_ftrace_graph(&funcgraph_ops);
 }
 
 static int graph_trace_update_thresh(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 98ea6d28df15..d3294721f119 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -218,6 +218,11 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
 	atomic_dec(&data->disabled);
 }
 
+static struct fgraph_ops fgraph_ops = {
+	.entryfunc		= &irqsoff_graph_entry,
+	.retfunc		= &irqsoff_graph_return,
+};
+
 static void irqsoff_trace_open(struct trace_iterator *iter)
 {
 	if (is_graph(iter->tr))
@@ -272,13 +277,6 @@ __trace_function(struct trace_array *tr,
 #else
 #define __trace_function trace_function
 
-#ifdef CONFIG_FUNCTION_TRACER
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
-{
-	return -1;
-}
-#endif
-
 static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
@@ -288,7 +286,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { }
 static void irqsoff_trace_close(struct trace_iterator *iter) { }
 
 #ifdef CONFIG_FUNCTION_TRACER
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
 static void irqsoff_print_header(struct seq_file *s)
 {
 	trace_default_header(s);
@@ -468,8 +465,7 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
 		return 0;
 
 	if (graph)
-		ret = register_ftrace_graph(&irqsoff_graph_return,
-					    &irqsoff_graph_entry);
+		ret = register_ftrace_graph(&fgraph_ops);
 	else
 		ret = register_ftrace_function(tr->ops);
 
@@ -485,7 +481,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
 		return;
 
 	if (graph)
-		unregister_ftrace_graph();
+		unregister_ftrace_graph(&fgraph_ops);
 	else
 		unregister_ftrace_function(tr->ops);
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2ce78100b4d3..4ea7e6845efb 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -162,6 +162,11 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
 	return;
 }
 
+static struct fgraph_ops fgraph_wakeup_ops = {
+	.entryfunc = &wakeup_graph_entry,
+	.retfunc = &wakeup_graph_return,
+};
+
 static void wakeup_trace_open(struct trace_iterator *iter)
 {
 	if (is_graph(iter->tr))
@@ -197,12 +202,6 @@ static void wakeup_print_header(struct seq_file *s)
 	else
 		trace_default_header(s);
 }
-#else /* CONFIG_FUNCTION_GRAPH_TRACER */
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
-{
-	return -1;
-}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
 #endif /* else CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
@@ -237,8 +236,7 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set)
 		return 0;
 
 	if (graph)
-		ret = register_ftrace_graph(&wakeup_graph_return,
-					    &wakeup_graph_entry);
+		ret = register_ftrace_graph(&fgraph_wakeup_ops);
 	else
 		ret = register_ftrace_function(tr->ops);
 
@@ -254,7 +252,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph)
 		return;
 
 	if (graph)
-		unregister_ftrace_graph();
+		unregister_ftrace_graph(&fgraph_wakeup_ops);
 	else
 		unregister_ftrace_function(tr->ops);
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 11e9daa4a568..9d402e7fc949 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -741,6 +741,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
 	return trace_graph_entry(trace);
 }
 
+static struct fgraph_ops fgraph_ops __initdata  = {
+	.entryfunc		= &trace_graph_entry_watchdog,
+	.retfunc		= &trace_graph_return,
+};
+
 /*
  * Pretty much the same than for the function tracer from which the selftest
  * has been borrowed.
@@ -765,8 +770,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	 */
 	tracing_reset_online_cpus(&tr->trace_buffer);
 	set_graph_array(tr);
-	ret = register_ftrace_graph(&trace_graph_return,
-				    &trace_graph_entry_watchdog);
+	ret = register_ftrace_graph(&fgraph_ops);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
-- 
cgit 


From 76b42b63ed0d004961097d3a3cd979129d4afd26 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 18 Nov 2018 18:36:19 -0500
Subject: function_graph: Move ftrace_graph_ret_addr() to fgraph.c

Move the function function_graph_ret_addr() to fgraph.c, as the management
of the curr_ret_stack is going to change, and all the accesses to ret_stack
needs to be done in fgraph.c.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c                | 55 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_functions_graph.c | 55 ------------------------------------
 2 files changed, 55 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index cc35606e9a3e..90fcefcaff2a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -232,6 +232,61 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	return ret;
 }
 
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ *			   to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'.  If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack.  It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+				    unsigned long ret, unsigned long *retp)
+{
+	int index = task->curr_ret_stack;
+	int i;
+
+	if (ret != (unsigned long)return_to_handler)
+		return ret;
+
+	if (index < 0)
+		return ret;
+
+	for (i = 0; i <= index; i++)
+		if (task->ret_stack[i].retp == retp)
+			return task->ret_stack[i].ret;
+
+	return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+				    unsigned long ret, unsigned long *retp)
+{
+	int task_idx;
+
+	if (ret != (unsigned long)return_to_handler)
+		return ret;
+
+	task_idx = task->curr_ret_stack;
+
+	if (!task->ret_stack || task_idx < *idx)
+		return ret;
+
+	task_idx -= *idx;
+	(*idx)++;
+
+	return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
 static struct ftrace_ops graph_ops = {
 	.func			= ftrace_stub,
 	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 140b4b51ab34..c2af1560e856 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -94,61 +94,6 @@ static void
 print_graph_duration(struct trace_array *tr, unsigned long long duration,
 		     struct trace_seq *s, u32 flags);
 
-/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- *			   to its original value
- *
- * This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
- * tracer has modified it to be 'return_to_handler'.  If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
- *
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
- *
- * 'retp' is a pointer to the return address on the stack.  It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
- */
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
-				    unsigned long ret, unsigned long *retp)
-{
-	int index = task->curr_ret_stack;
-	int i;
-
-	if (ret != (unsigned long)return_to_handler)
-		return ret;
-
-	if (index < 0)
-		return ret;
-
-	for (i = 0; i <= index; i++)
-		if (task->ret_stack[i].retp == retp)
-			return task->ret_stack[i].ret;
-
-	return ret;
-}
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
-				    unsigned long ret, unsigned long *retp)
-{
-	int task_idx;
-
-	if (ret != (unsigned long)return_to_handler)
-		return ret;
-
-	task_idx = task->curr_ret_stack;
-
-	if (!task->ret_stack || task_idx < *idx)
-		return ret;
-
-	task_idx -= *idx;
-	(*idx)++;
-
-	return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-
 int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
-- 
cgit 


From b0e21a61d3196762b61f43ae994ffd255f646774 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 19 Nov 2018 20:54:08 -0500
Subject: function_graph: Have profiler use new helper
 ftrace_graph_get_ret_stack()

The ret_stack processing is going to change, and that is going
to break anything that is accessing the ret_stack directly. One user is the
function graph profiler. By using the ftrace_graph_get_ret_stack() helper
function, the profiler can access the ret_stack entry without relying on the
implementation details of the stack itself.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  3 +++
 kernel/trace/fgraph.c  | 11 +++++++++++
 kernel/trace/ftrace.c  | 21 +++++++++++----------
 3 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 21c80491ccde..98e141c71ad0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -785,6 +785,9 @@ extern int
 function_graph_enter(unsigned long ret, unsigned long func,
 		     unsigned long frame_pointer, unsigned long *retp);
 
+struct ftrace_ret_stack *
+ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
+
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 				    unsigned long ret, unsigned long *retp);
 
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 90fcefcaff2a..a3704ec8b599 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	return ret;
 }
 
+struct ftrace_ret_stack *
+ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
+{
+	idx = current->curr_ret_stack - idx;
+
+	if (idx >= 0 && idx <= task->curr_ret_stack)
+		return &current->ret_stack[idx];
+
+	return NULL;
+}
+
 /**
  * ftrace_graph_ret_addr - convert a potentially modified stack return address
  *			   to its original value
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d06fe588e650..8ef9fc226037 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -792,7 +792,7 @@ void ftrace_graph_graph_time_control(bool enable)
 
 static int profile_graph_entry(struct ftrace_graph_ent *trace)
 {
-	int index = current->curr_ret_stack;
+	struct ftrace_ret_stack *ret_stack;
 
 	function_profile_call(trace->func, 0, NULL, NULL);
 
@@ -800,14 +800,16 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 	if (!current->ret_stack)
 		return 0;
 
-	if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
-		current->ret_stack[index].subtime = 0;
+	ret_stack = ftrace_graph_get_ret_stack(current, 0);
+	if (ret_stack)
+		ret_stack->subtime = 0;
 
 	return 1;
 }
 
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
+	struct ftrace_ret_stack *ret_stack;
 	struct ftrace_profile_stat *stat;
 	unsigned long long calltime;
 	struct ftrace_profile *rec;
@@ -825,16 +827,15 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	calltime = trace->rettime - trace->calltime;
 
 	if (!fgraph_graph_time) {
-		int index;
-
-		index = current->curr_ret_stack;
 
 		/* Append this call time to the parent time to subtract */
-		if (index)
-			current->ret_stack[index - 1].subtime += calltime;
+		ret_stack = ftrace_graph_get_ret_stack(current, 1);
+		if (ret_stack)
+			ret_stack->subtime += calltime;
 
-		if (current->ret_stack[index].subtime < calltime)
-			calltime -= current->ret_stack[index].subtime;
+		ret_stack = ftrace_graph_get_ret_stack(current, 0);
+		if (ret_stack && ret_stack->subtime < calltime)
+			calltime -= ret_stack->subtime;
 		else
 			calltime = 0;
 	}
-- 
cgit 


From ca16b0fbb05242f18da9d810c07d3882ffed831c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 20 Jun 2018 14:08:00 +0300
Subject: tracing: Have trace_stack nr_entries compare not be so subtle

Dan Carpenter reviewed the trace_stack.c code and figured he found an off by
one bug.

 "From reviewing the code, it seems possible for
  stack_trace_max.nr_entries to be set to .max_entries and in that case we
  would be reading one element beyond the end of the stack_dump_trace[]
  array.  If it's not set to .max_entries then the bug doesn't affect
  runtime."

Although it looks to be the case, it is not. Because we have:

 static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };

 struct stack_trace stack_trace_max = {
	.max_entries		= STACK_TRACE_ENTRIES - 1,
	.entries		= &stack_dump_trace[0],
 };

And:

	stack_trace_max.nr_entries = x;
	for (; x < i; x++)
		stack_dump_trace[x] = ULONG_MAX;

Even if nr_entries equals max_entries, indexing with it into the
stack_dump_trace[] array will not overflow the array. But if it is the case,
the second part of the conditional that tests stack_dump_trace[nr_entries]
to ULONG_MAX will always be true.

By applying Dan's patch, it removes the subtle aspect of it and makes the if
conditional slightly more efficient.

Link: http://lkml.kernel.org/r/20180620110758.crunhd5bfep7zuiz@kili.mountain

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2b0d1ee3241c..e2a153fc1afc 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -286,7 +286,7 @@ __next(struct seq_file *m, loff_t *pos)
 {
 	long n = *pos - 1;
 
-	if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+	if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
 		return NULL;
 
 	m->private = (void *)n;
-- 
cgit 


From 2c2b0a78b373908926e4683ea5571332f63c0eb5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 29 Nov 2018 20:32:26 -0500
Subject: ring-buffer: Add percentage of ring buffer full to wake up reader

Instead of just waiting for a page to be full before waking up a pending
reader, allow the reader to pass in a "percentage" of pages that have
content before waking up a reader. This should help keep the process of
reading the events not cause wake ups that constantly cause reading of the
buffer.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  4 ++-
 kernel/trace/ring_buffer.c  | 71 +++++++++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.c        |  8 ++---
 3 files changed, 73 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 0940fda59872..5b9ae62272bb 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 	__ring_buffer_alloc((size), (flags), &__key);	\
 })
 
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full);
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full);
 __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 			  struct file *filp, poll_table *poll_table);
 
@@ -189,6 +189,8 @@ bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
 
 size_t ring_buffer_page_len(void *page);
 
+size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu);
+size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu);
 
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu);
 void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 65bd4616220d..9edb628603ab 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -487,6 +487,9 @@ struct ring_buffer_per_cpu {
 	local_t				dropped_events;
 	local_t				committing;
 	local_t				commits;
+	local_t				pages_touched;
+	local_t				pages_read;
+	size_t				shortest_full;
 	unsigned long			read;
 	unsigned long			read_bytes;
 	u64				write_stamp;
@@ -529,6 +532,41 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 };
 
+/**
+ * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
+ * @buffer: The ring_buffer to get the number of pages from
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
+ *
+ * Returns the number of pages used by a per_cpu buffer of the ring buffer.
+ */
+size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu)
+{
+	return buffer->buffers[cpu]->nr_pages;
+}
+
+/**
+ * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * @buffer: The ring_buffer to get the number of pages from
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
+ *
+ * Returns the number of pages that have content in the ring buffer.
+ */
+size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu)
+{
+	size_t read;
+	size_t cnt;
+
+	read = local_read(&buffer->buffers[cpu]->pages_read);
+	cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+	/* The reader can read an empty page, but not more than that */
+	if (cnt < read) {
+		WARN_ON_ONCE(read > cnt + 1);
+		return 0;
+	}
+
+	return cnt - read;
+}
+
 /*
  * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
  *
@@ -556,7 +594,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
  * as data is added to any of the @buffer's cpu buffers. Otherwise
  * it will wait for data to be added to a specific cpu buffer.
  */
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full)
 {
 	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
 	DEFINE_WAIT(wait);
@@ -571,7 +609,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 	if (cpu == RING_BUFFER_ALL_CPUS) {
 		work = &buffer->irq_work;
 		/* Full only makes sense on per cpu reads */
-		full = false;
+		full = 0;
 	} else {
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			return -ENODEV;
@@ -623,15 +661,22 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 		    !ring_buffer_empty_cpu(buffer, cpu)) {
 			unsigned long flags;
 			bool pagebusy;
+			size_t nr_pages;
+			size_t dirty;
 
 			if (!full)
 				break;
 
 			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+			nr_pages = cpu_buffer->nr_pages;
+			dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+			if (!cpu_buffer->shortest_full ||
+			    cpu_buffer->shortest_full < full)
+				cpu_buffer->shortest_full = full;
 			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
-			if (!pagebusy)
+			if (!pagebusy &&
+			    (!nr_pages || (dirty * 100) > full * nr_pages))
 				break;
 		}
 
@@ -1054,6 +1099,7 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
 	old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
 	old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
 
+	local_inc(&cpu_buffer->pages_touched);
 	/*
 	 * Just make sure we have seen our old_write and synchronize
 	 * with any interrupts that come in.
@@ -2603,6 +2649,16 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 	pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
 
 	if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+		size_t nr_pages;
+		size_t dirty;
+		size_t full;
+
+		full = cpu_buffer->shortest_full;
+		nr_pages = cpu_buffer->nr_pages;
+		dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
+		if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+			return;
+
 		cpu_buffer->irq_work.wakeup_full = true;
 		cpu_buffer->irq_work.full_waiters_pending = false;
 		/* irq_work_queue() supplies it's own memory barriers */
@@ -3732,13 +3788,15 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 		goto spin;
 
 	/*
-	 * Yeah! We succeeded in replacing the page.
+	 * Yay! We succeeded in replacing the page.
 	 *
 	 * Now make the new head point back to the reader page.
 	 */
 	rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
 	rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
 
+	local_inc(&cpu_buffer->pages_read);
+
 	/* Finally update the reader page to the new head */
 	cpu_buffer->reader_page = reader;
 	cpu_buffer->reader_page->read = 0;
@@ -4334,6 +4392,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->entries, 0);
 	local_set(&cpu_buffer->committing, 0);
 	local_set(&cpu_buffer->commits, 0);
+	local_set(&cpu_buffer->pages_touched, 0);
+	local_set(&cpu_buffer->pages_read, 0);
+	cpu_buffer->shortest_full = 0;
 	cpu_buffer->read = 0;
 	cpu_buffer->read_bytes = 0;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff1c4b20cd0a..48d5eb22ff33 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1431,7 +1431,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 }
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
-static int wait_on_pipe(struct trace_iterator *iter, bool full)
+static int wait_on_pipe(struct trace_iterator *iter, int full)
 {
 	/* Iterators are static, they should be filled or empty */
 	if (trace_buffer_iter(iter, iter->cpu_file))
@@ -5693,7 +5693,7 @@ static int tracing_wait_pipe(struct file *filp)
 
 		mutex_unlock(&iter->mutex);
 
-		ret = wait_on_pipe(iter, false);
+		ret = wait_on_pipe(iter, 0);
 
 		mutex_lock(&iter->mutex);
 
@@ -6751,7 +6751,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 			if ((filp->f_flags & O_NONBLOCK))
 				return -EAGAIN;
 
-			ret = wait_on_pipe(iter, false);
+			ret = wait_on_pipe(iter, 0);
 			if (ret)
 				return ret;
 
@@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
 			goto out;
 
-		ret = wait_on_pipe(iter, true);
+		ret = wait_on_pipe(iter, 1);
 		if (ret)
 			goto out;
 
-- 
cgit 


From 03329f9939781041424edd29487b9603a704fcd9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 29 Nov 2018 21:38:42 -0500
Subject: tracing: Add tracefs file buffer_percentage

Add a "buffer_percentage" file, that allows users to specify how much of the
buffer (percentage of pages) need to be filled before waking up a task
blocked on a per cpu trace_pipe_raw file.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 39 +++++++++++++++++++--------------
 kernel/trace/trace.c       | 54 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h       |  1 +
 3 files changed, 77 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9edb628603ab..5434c16f2192 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -489,6 +489,7 @@ struct ring_buffer_per_cpu {
 	local_t				commits;
 	local_t				pages_touched;
 	local_t				pages_read;
+	long				last_pages_touch;
 	size_t				shortest_full;
 	unsigned long			read;
 	unsigned long			read_bytes;
@@ -2632,7 +2633,9 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 static __always_inline void
 rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 {
-	bool pagebusy;
+	size_t nr_pages;
+	size_t dirty;
+	size_t full;
 
 	if (buffer->irq_work.waiters_pending) {
 		buffer->irq_work.waiters_pending = false;
@@ -2646,24 +2649,27 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 		irq_work_queue(&cpu_buffer->irq_work.work);
 	}
 
-	pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+	if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
+		return;
 
-	if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
-		size_t nr_pages;
-		size_t dirty;
-		size_t full;
+	if (cpu_buffer->reader_page == cpu_buffer->commit_page)
+		return;
 
-		full = cpu_buffer->shortest_full;
-		nr_pages = cpu_buffer->nr_pages;
-		dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
-		if (full && nr_pages && (dirty * 100) <= full * nr_pages)
-			return;
+	if (!cpu_buffer->irq_work.full_waiters_pending)
+		return;
 
-		cpu_buffer->irq_work.wakeup_full = true;
-		cpu_buffer->irq_work.full_waiters_pending = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&cpu_buffer->irq_work.work);
-	}
+	cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
+
+	full = cpu_buffer->shortest_full;
+	nr_pages = cpu_buffer->nr_pages;
+	dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
+	if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+		return;
+
+	cpu_buffer->irq_work.wakeup_full = true;
+	cpu_buffer->irq_work.full_waiters_pending = false;
+	/* irq_work_queue() supplies it's own memory barriers */
+	irq_work_queue(&cpu_buffer->irq_work.work);
 }
 
 /*
@@ -4394,6 +4400,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->commits, 0);
 	local_set(&cpu_buffer->pages_touched, 0);
 	local_set(&cpu_buffer->pages_read, 0);
+	cpu_buffer->last_pages_touch = 0;
 	cpu_buffer->shortest_full = 0;
 	cpu_buffer->read = 0;
 	cpu_buffer->read_bytes = 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 48d5eb22ff33..d382fd1aa4a6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
 			goto out;
 
-		ret = wait_on_pipe(iter, 1);
+		ret = wait_on_pipe(iter, iter->tr->buffer_percent);
 		if (ret)
 			goto out;
 
@@ -7662,6 +7662,53 @@ static const struct file_operations rb_simple_fops = {
 	.llseek		= default_llseek,
 };
 
+static ssize_t
+buffer_percent_read(struct file *filp, char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = tr->buffer_percent;
+	r = sprintf(buf, "%d\n", r);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_percent_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > 100)
+		return -EINVAL;
+
+	if (!val)
+		val = 1;
+
+	tr->buffer_percent = val;
+
+	(*ppos)++;
+
+	return cnt;
+}
+
+static const struct file_operations buffer_percent_fops = {
+	.open		= tracing_open_generic_tr,
+	.read		= buffer_percent_read,
+	.write		= buffer_percent_write,
+	.release	= tracing_release_generic_tr,
+	.llseek		= default_llseek,
+};
+
 struct dentry *trace_instance_dir;
 
 static void
@@ -7970,6 +8017,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("timestamp_mode", 0444, d_tracer, tr,
 			  &trace_time_stamp_mode_fops);
 
+	tr->buffer_percent = 1;
+
+	trace_create_file("buffer_percent", 0444, d_tracer,
+			tr, &buffer_percent_fops);
+
 	create_trace_options_dir(tr);
 
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ab16eca76e59..08900828d282 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -247,6 +247,7 @@ struct trace_array {
 	int			clock_id;
 	int			nr_topts;
 	bool			clear_trace;
+	int			buffer_percent;
 	struct tracer		*current_trace;
 	unsigned int		trace_flags;
 	unsigned char		trace_flags_index[TRACE_FLAGS_MAX_SIZE];
-- 
cgit 


From a7b1d74e872a4299e1aef66a648316c9c23e5ab4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 29 Nov 2018 22:36:47 -0500
Subject: tracing: Change default buffer_percent to 50

After running several tests, it appears that having the reader wait till
half the buffer is full before starting to read (and causing its own events
to fill up the ring buffer constantly), works well. It keeps trace-cmd (the
main user of this interface) from dominating the traces it records.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d382fd1aa4a6..194c01838e3f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8017,7 +8017,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("timestamp_mode", 0444, d_tracer, tr,
 			  &trace_time_stamp_mode_fops);
 
-	tr->buffer_percent = 1;
+	tr->buffer_percent = 50;
 
 	trace_create_file("buffer_percent", 0444, d_tracer,
 			tr, &buffer_percent_fops);
-- 
cgit 


From 547cd9eacd1c699c8d1fa77c95c6cdb33b2eba6a Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:00:15 +0900
Subject: tracing/uprobes: Add busy check when cleanup all uprobes

Add a busy check loop in cleanup_all_probes() before
trying to remove all events in uprobe_events, the same way
that kprobe_events does.

Without this change, writing null to uprobe_events will
try to remove events but if one of them is enabled, it will
stop there leaving some events cleared and others not clceared.

With this change, writing null to uprobe_events makes
sure all events are not enabled before removing events.
So, it clears all events, or returns an error (-EBUSY)
with keeping all events.

Link: http://lkml.kernel.org/r/154140841557.17322.12653952888762532401.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 31ea48eceda1..b708e4ff7ea7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -587,12 +587,19 @@ static int cleanup_all_probes(void)
 	int ret = 0;
 
 	mutex_lock(&uprobe_lock);
+	/* Ensure no probe is in use. */
+	list_for_each_entry(tu, &uprobe_list, list)
+		if (trace_probe_is_enabled(&tu->tp)) {
+			ret = -EBUSY;
+			goto end;
+		}
 	while (!list_empty(&uprobe_list)) {
 		tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
 		ret = unregister_trace_uprobe(tu);
 		if (ret)
 			break;
 	}
+end:
 	mutex_unlock(&uprobe_lock);
 	return ret;
 }
-- 
cgit 


From fc800a10be26017f8f338bc8e500d48e3e6429d9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:00:43 +0900
Subject: tracing: Lock event_mutex before synth_event_mutex

synthetic event is using synth_event_mutex for protecting
synth_event_list, and event_trigger_write() path acquires
locks as below order.

event_trigger_write(event_mutex)
  ->trigger_process_regex(trigger_cmd_mutex)
    ->event_hist_trigger_func(synth_event_mutex)

On the other hand, synthetic event creation and deletion paths
call trace_add_event_call() and trace_remove_event_call()
which acquires event_mutex. In that case, if we keep the
synth_event_mutex locked while registering/unregistering synthetic
events, its dependency will be inversed.

To avoid this issue, current synthetic event is using a 2 phase
process to create/delete events. For example, it searches existing
events under synth_event_mutex to check for event-name conflicts, and
unlocks synth_event_mutex, then registers a new event under event_mutex
locked. Finally, it locks synth_event_mutex and tries to add the
new event to the list. But it can introduce complexity and a chance
for name conflicts.

To solve this simpler, this introduces trace_add_event_call_nolock()
and trace_remove_event_call_nolock() which don't acquire
event_mutex inside. synthetic event can lock event_mutex before
synth_event_mutex to solve the lock dependency issue simpler.

Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h     |  2 ++
 kernel/trace/trace_events.c      | 34 ++++++++++++++++++++++++++++------
 kernel/trace/trace_events_hist.c | 24 ++++++++++--------------
 3 files changed, 40 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 4130a5497d40..3aa05593a53f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -529,6 +529,8 @@ extern int trace_event_raw_init(struct trace_event_call *call);
 extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      const char *name, int offset, int size,
 			      int is_signed, int filter_type);
+extern int trace_add_event_call_nolock(struct trace_event_call *call);
+extern int trace_remove_event_call_nolock(struct trace_event_call *call);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
 extern int trace_event_get_offsets(struct trace_event_call *call);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f94be0c2827b..a3b157f689ee 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2305,11 +2305,11 @@ __trace_early_add_new_event(struct trace_event_call *call,
 struct ftrace_module_file_ops;
 static void __add_event_to_tracers(struct trace_event_call *call);
 
-/* Add an additional event_call dynamically */
-int trace_add_event_call(struct trace_event_call *call)
+int trace_add_event_call_nolock(struct trace_event_call *call)
 {
 	int ret;
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
+
 	mutex_lock(&trace_types_lock);
 
 	ret = __register_event(call, NULL);
@@ -2317,6 +2317,16 @@ int trace_add_event_call(struct trace_event_call *call)
 		__add_event_to_tracers(call);
 
 	mutex_unlock(&trace_types_lock);
+	return ret;
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct trace_event_call *call)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+	ret = trace_add_event_call_nolock(call);
 	mutex_unlock(&event_mutex);
 	return ret;
 }
@@ -2366,17 +2376,29 @@ static int probe_remove_event_call(struct trace_event_call *call)
 	return 0;
 }
 
-/* Remove an event_call */
-int trace_remove_event_call(struct trace_event_call *call)
+/* no event_mutex version */
+int trace_remove_event_call_nolock(struct trace_event_call *call)
 {
 	int ret;
 
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
+
 	mutex_lock(&trace_types_lock);
 	down_write(&trace_event_sem);
 	ret = probe_remove_event_call(call);
 	up_write(&trace_event_sem);
 	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
+/* Remove an event_call */
+int trace_remove_event_call(struct trace_event_call *call)
+{
+	int ret;
+
+	mutex_lock(&event_mutex);
+	ret = trace_remove_event_call_nolock(call);
 	mutex_unlock(&event_mutex);
 
 	return ret;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index eb908ef2ecec..1670c65389fe 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -912,7 +912,7 @@ static int register_synth_event(struct synth_event *event)
 	call->data = event;
 	call->tp = event->tp;
 
-	ret = trace_add_event_call(call);
+	ret = trace_add_event_call_nolock(call);
 	if (ret) {
 		pr_warn("Failed to register synthetic event: %s\n",
 			trace_event_name(call));
@@ -936,7 +936,7 @@ static int unregister_synth_event(struct synth_event *event)
 	struct trace_event_call *call = &event->call;
 	int ret;
 
-	ret = trace_remove_event_call(call);
+	ret = trace_remove_event_call_nolock(call);
 
 	return ret;
 }
@@ -1013,12 +1013,10 @@ static void add_or_delete_synth_event(struct synth_event *event, int delete)
 	if (delete)
 		free_synth_event(event);
 	else {
-		mutex_lock(&synth_event_mutex);
 		if (!find_synth_event(event->name))
 			list_add(&event->list, &synth_event_list);
 		else
 			free_synth_event(event);
-		mutex_unlock(&synth_event_mutex);
 	}
 }
 
@@ -1030,6 +1028,7 @@ static int create_synth_event(int argc, char **argv)
 	int i, consumed = 0, n_fields = 0, ret = 0;
 	char *name;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&synth_event_mutex);
 
 	/*
@@ -1102,8 +1101,6 @@ static int create_synth_event(int argc, char **argv)
 		goto err;
 	}
  out:
-	mutex_unlock(&synth_event_mutex);
-
 	if (event) {
 		if (delete_event) {
 			ret = unregister_synth_event(event);
@@ -1113,10 +1110,13 @@ static int create_synth_event(int argc, char **argv)
 			add_or_delete_synth_event(event, ret);
 		}
 	}
+	mutex_unlock(&synth_event_mutex);
+	mutex_unlock(&event_mutex);
 
 	return ret;
  err:
 	mutex_unlock(&synth_event_mutex);
+	mutex_unlock(&event_mutex);
 
 	for (i = 0; i < n_fields; i++)
 		free_synth_field(fields[i]);
@@ -1127,12 +1127,10 @@ static int create_synth_event(int argc, char **argv)
 
 static int release_all_synth_events(void)
 {
-	struct list_head release_events;
 	struct synth_event *event, *e;
 	int ret = 0;
 
-	INIT_LIST_HEAD(&release_events);
-
+	mutex_lock(&event_mutex);
 	mutex_lock(&synth_event_mutex);
 
 	list_for_each_entry(event, &synth_event_list, list) {
@@ -1142,16 +1140,14 @@ static int release_all_synth_events(void)
 		}
 	}
 
-	list_splice_init(&event->list, &release_events);
-
-	mutex_unlock(&synth_event_mutex);
-
-	list_for_each_entry_safe(event, e, &release_events, list) {
+	list_for_each_entry_safe(event, e, &synth_event_list, list) {
 		list_del(&event->list);
 
 		ret = unregister_synth_event(event);
 		add_or_delete_synth_event(event, !ret);
 	}
+	mutex_unlock(&synth_event_mutex);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
-- 
cgit 


From faacb361f271be4baf2d807e2eeaba87e059225f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:01:12 +0900
Subject: tracing: Simplify creation and deletion of synthetic events

Since the event_mutex and synth_event_mutex ordering issue
is gone, we can skip existing event check when adding or
deleting events, and some redundant code in error path.

This changes release_all_synth_events() to abort the process
when it hits any error and returns the error code. It succeeds
only if it has no error.

Link: http://lkml.kernel.org/r/154140847194.17322.17960275728005067803.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 53 ++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1670c65389fe..0feb7f460123 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1008,18 +1008,6 @@ struct hist_var_data {
 	struct hist_trigger_data *hist_data;
 };
 
-static void add_or_delete_synth_event(struct synth_event *event, int delete)
-{
-	if (delete)
-		free_synth_event(event);
-	else {
-		if (!find_synth_event(event->name))
-			list_add(&event->list, &synth_event_list);
-		else
-			free_synth_event(event);
-	}
-}
-
 static int create_synth_event(int argc, char **argv)
 {
 	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
@@ -1052,15 +1040,16 @@ static int create_synth_event(int argc, char **argv)
 	if (event) {
 		if (delete_event) {
 			if (event->ref) {
-				event = NULL;
 				ret = -EBUSY;
 				goto out;
 			}
-			list_del(&event->list);
-			goto out;
-		}
-		event = NULL;
-		ret = -EEXIST;
+			ret = unregister_synth_event(event);
+			if (!ret) {
+				list_del(&event->list);
+				free_synth_event(event);
+			}
+		} else
+			ret = -EEXIST;
 		goto out;
 	} else if (delete_event) {
 		ret = -ENOENT;
@@ -1100,29 +1089,21 @@ static int create_synth_event(int argc, char **argv)
 		event = NULL;
 		goto err;
 	}
+	ret = register_synth_event(event);
+	if (!ret)
+		list_add(&event->list, &synth_event_list);
+	else
+		free_synth_event(event);
  out:
-	if (event) {
-		if (delete_event) {
-			ret = unregister_synth_event(event);
-			add_or_delete_synth_event(event, !ret);
-		} else {
-			ret = register_synth_event(event);
-			add_or_delete_synth_event(event, ret);
-		}
-	}
 	mutex_unlock(&synth_event_mutex);
 	mutex_unlock(&event_mutex);
 
 	return ret;
  err:
-	mutex_unlock(&synth_event_mutex);
-	mutex_unlock(&event_mutex);
-
 	for (i = 0; i < n_fields; i++)
 		free_synth_field(fields[i]);
-	free_synth_event(event);
 
-	return ret;
+	goto out;
 }
 
 static int release_all_synth_events(void)
@@ -1141,10 +1122,12 @@ static int release_all_synth_events(void)
 	}
 
 	list_for_each_entry_safe(event, e, &synth_event_list, list) {
-		list_del(&event->list);
-
 		ret = unregister_synth_event(event);
-		add_or_delete_synth_event(event, !ret);
+		if (!ret) {
+			list_del(&event->list);
+			free_synth_event(event);
+		} else
+			break;
 	}
 	mutex_unlock(&synth_event_mutex);
 	mutex_unlock(&event_mutex);
-- 
cgit 


From d00bbea9456f35fb34310d454e561f05d68d07fe Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:01:40 +0900
Subject: tracing: Integrate similar probe argument parsers

Integrate similar argument parsers for kprobes and uprobes events
into traceprobe_parse_probe_arg().

Link: http://lkml.kernel.org/r/154140850016.17322.9836787731210512176.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 48 ++-------------------------------------------
 kernel/trace/trace_probe.c  | 47 +++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_probe.h  |  7 ++-----
 kernel/trace/trace_uprobe.c | 44 ++---------------------------------------
 4 files changed, 50 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index fec67188c4d2..d313bcc259dc 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -548,7 +548,6 @@ static int create_trace_kprobe(int argc, char **argv)
 	bool is_return = false, is_delete = false;
 	char *symbol = NULL, *event = NULL, *group = NULL;
 	int maxactive = 0;
-	char *arg;
 	long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
@@ -676,53 +675,10 @@ static int create_trace_kprobe(int argc, char **argv)
 	}
 
 	/* parse arguments */
-	ret = 0;
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
-		struct probe_arg *parg = &tk->tp.args[i];
-
-		/* Increment count for freeing args in error case */
-		tk->tp.nr_args++;
-
-		/* Parse argument name */
-		arg = strchr(argv[i], '=');
-		if (arg) {
-			*arg++ = '\0';
-			parg->name = kstrdup(argv[i], GFP_KERNEL);
-		} else {
-			arg = argv[i];
-			/* If argument name is omitted, set "argN" */
-			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
-			parg->name = kstrdup(buf, GFP_KERNEL);
-		}
-
-		if (!parg->name) {
-			pr_info("Failed to allocate argument[%d] name.\n", i);
-			ret = -ENOMEM;
-			goto error;
-		}
-
-		if (!is_good_name(parg->name)) {
-			pr_info("Invalid argument[%d] name: %s\n",
-				i, parg->name);
-			ret = -EINVAL;
-			goto error;
-		}
-
-		if (traceprobe_conflict_field_name(parg->name,
-							tk->tp.args, i)) {
-			pr_info("Argument[%d] name '%s' conflicts with "
-				"another field.\n", i, argv[i]);
-			ret = -EINVAL;
-			goto error;
-		}
-
-		/* Parse fetch argument */
-		ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
-						 flags);
-		if (ret) {
-			pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+		ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags);
+		if (ret)
 			goto error;
-		}
 	}
 
 	ret = register_trace_kprobe(tk);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index bd30e9398d2a..449150c6a87f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -348,7 +348,7 @@ static int __parse_bitfield_probe_arg(const char *bf,
 }
 
 /* String length checking wrapper */
-int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
 		struct probe_arg *parg, unsigned int flags)
 {
 	struct fetch_insn *code, *scode, *tmp = NULL;
@@ -491,8 +491,8 @@ fail:
 }
 
 /* Return 1 if name is reserved or already used by another argument */
-int traceprobe_conflict_field_name(const char *name,
-			       struct probe_arg *args, int narg)
+static int traceprobe_conflict_field_name(const char *name,
+					  struct probe_arg *args, int narg)
 {
 	int i;
 
@@ -507,6 +507,47 @@ int traceprobe_conflict_field_name(const char *name,
 	return 0;
 }
 
+int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
+				unsigned int flags)
+{
+	struct probe_arg *parg = &tp->args[i];
+	char *body;
+	int ret;
+
+	/* Increment count for freeing args in error case */
+	tp->nr_args++;
+
+	body = strchr(arg, '=');
+	if (body) {
+		parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
+		body++;
+	} else {
+		/* If argument name is omitted, set "argN" */
+		parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1);
+		body = arg;
+	}
+	if (!parg->name)
+		return -ENOMEM;
+
+	if (!is_good_name(parg->name)) {
+		pr_info("Invalid argument[%d] name: %s\n",
+			i, parg->name);
+		return -EINVAL;
+	}
+
+	if (traceprobe_conflict_field_name(parg->name, tp->args, i)) {
+		pr_info("Argument[%d]: '%s' conflicts with another field.\n",
+			i, parg->name);
+		return -EINVAL;
+	}
+
+	/* Parse fetch argument */
+	ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags);
+	if (ret)
+		pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+	return ret;
+}
+
 void traceprobe_free_probe_arg(struct probe_arg *arg)
 {
 	struct fetch_insn *code = arg->code;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 974afc1a3e73..feeec261b356 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -272,11 +272,8 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
 #define TPARG_FL_FENTRY BIT(2)
 #define TPARG_FL_MASK	GENMASK(2, 0)
 
-extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
-		   struct probe_arg *parg, unsigned int flags);
-
-extern int traceprobe_conflict_field_name(const char *name,
-			       struct probe_arg *args, int narg);
+extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
+				char *arg, unsigned int flags);
 
 extern int traceprobe_update_arg(struct probe_arg *arg);
 extern void traceprobe_free_probe_arg(struct probe_arg *arg);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b708e4ff7ea7..6eaaa2150685 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -517,51 +517,11 @@ static int create_trace_uprobe(int argc, char **argv)
 	}
 
 	/* parse arguments */
-	ret = 0;
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
-		struct probe_arg *parg = &tu->tp.args[i];
-
-		/* Increment count for freeing args in error case */
-		tu->tp.nr_args++;
-
-		/* Parse argument name */
-		arg = strchr(argv[i], '=');
-		if (arg) {
-			*arg++ = '\0';
-			parg->name = kstrdup(argv[i], GFP_KERNEL);
-		} else {
-			arg = argv[i];
-			/* If argument name is omitted, set "argN" */
-			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
-			parg->name = kstrdup(buf, GFP_KERNEL);
-		}
-
-		if (!parg->name) {
-			pr_info("Failed to allocate argument[%d] name.\n", i);
-			ret = -ENOMEM;
-			goto error;
-		}
-
-		if (!is_good_name(parg->name)) {
-			pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
-			ret = -EINVAL;
-			goto error;
-		}
-
-		if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
-			pr_info("Argument[%d] name '%s' conflicts with "
-				"another field.\n", i, argv[i]);
-			ret = -EINVAL;
-			goto error;
-		}
-
-		/* Parse fetch argument */
-		ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
+		ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
 					is_return ? TPARG_FL_RETURN : 0);
-		if (ret) {
-			pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+		if (ret)
 			goto error;
-		}
 	}
 
 	ret = register_trace_uprobe(tu);
-- 
cgit 


From 5448d44c38557fc15d1c53b608a9c9f0e1ca8f86 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:02:08 +0900
Subject: tracing: Add unified dynamic event framework

Add unified dynamic event framework for ftrace kprobes, uprobes
and synthetic events. Those dynamic events can be co-exist on
same file because those syntax doesn't overlap.

This introduces a framework part which provides a unified tracefs
interface and operations.

Link: http://lkml.kernel.org/r/154140852824.17322.12250362185969352095.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig          |   3 +
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace.c          |   4 +
 kernel/trace/trace_dynevent.c | 210 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_dynevent.h | 119 ++++++++++++++++++++++++
 5 files changed, 337 insertions(+)
 create mode 100644 kernel/trace/trace_dynevent.c
 create mode 100644 kernel/trace/trace_dynevent.h

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5e3de28c7677..bf2e8a5a91f1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,6 +518,9 @@ config BPF_EVENTS
 	help
 	  This allows the user to attach BPF programs to kprobe events.
 
+config DYNAMIC_EVENTS
+	def_bool n
+
 config PROBE_EVENTS
 	def_bool n
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c7ade7965464..c2b2148bb1d2 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -79,6 +79,7 @@ endif
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
+obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o
 obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
 obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 194c01838e3f..7e0332f90ed4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4604,6 +4604,10 @@ static const char readme_msg[] =
 	"\t\t\t  traces\n"
 #endif
 #endif /* CONFIG_STACK_TRACER */
+#ifdef CONFIG_DYNAMIC_EVENTS
+	"  dynamic_events\t\t- Add/remove/show the generic dynamic events\n"
+	"\t\t\t  Write into this file to define/undefine new trace events.\n"
+#endif
 #ifdef CONFIG_KPROBE_EVENTS
 	"  kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
 	"\t\t\t  Write into this file to define/undefine new trace events.\n"
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
new file mode 100644
index 000000000000..f17a887abb66
--- /dev/null
+++ b/kernel/trace/trace_dynevent.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic dynamic event control interface
+ *
+ * Copyright (C) 2018 Masami Hiramatsu <mhiramat@kernel.org>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/tracefs.h>
+
+#include "trace.h"
+#include "trace_dynevent.h"
+
+static DEFINE_MUTEX(dyn_event_ops_mutex);
+static LIST_HEAD(dyn_event_ops_list);
+
+int dyn_event_register(struct dyn_event_operations *ops)
+{
+	if (!ops || !ops->create || !ops->show || !ops->is_busy ||
+	    !ops->free || !ops->match)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&ops->list);
+	mutex_lock(&dyn_event_ops_mutex);
+	list_add_tail(&ops->list, &dyn_event_ops_list);
+	mutex_unlock(&dyn_event_ops_mutex);
+	return 0;
+}
+
+int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
+{
+	struct dyn_event *pos, *n;
+	char *system = NULL, *event, *p;
+	int ret = -ENOENT;
+
+	if (argv[0][1] != ':')
+		return -EINVAL;
+
+	event = &argv[0][2];
+	p = strchr(event, '/');
+	if (p) {
+		system = event;
+		event = p + 1;
+		*p = '\0';
+	}
+	if (event[0] == '\0')
+		return -EINVAL;
+
+	mutex_lock(&event_mutex);
+	for_each_dyn_event_safe(pos, n) {
+		if (type && type != pos->ops)
+			continue;
+		if (pos->ops->match(system, event, pos)) {
+			ret = pos->ops->free(pos);
+			break;
+		}
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int create_dyn_event(int argc, char **argv)
+{
+	struct dyn_event_operations *ops;
+	int ret;
+
+	if (argv[0][0] == '-')
+		return dyn_event_release(argc, argv, NULL);
+
+	mutex_lock(&dyn_event_ops_mutex);
+	list_for_each_entry(ops, &dyn_event_ops_list, list) {
+		ret = ops->create(argc, (const char **)argv);
+		if (!ret || ret != -ECANCELED)
+			break;
+	}
+	mutex_unlock(&dyn_event_ops_mutex);
+	if (ret == -ECANCELED)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+/* Protected by event_mutex */
+LIST_HEAD(dyn_event_list);
+
+void *dyn_event_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&event_mutex);
+	return seq_list_start(&dyn_event_list, *pos);
+}
+
+void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &dyn_event_list, pos);
+}
+
+void dyn_event_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&event_mutex);
+}
+
+static int dyn_event_seq_show(struct seq_file *m, void *v)
+{
+	struct dyn_event *ev = v;
+
+	if (ev && ev->ops)
+		return ev->ops->show(m, ev);
+
+	return 0;
+}
+
+static const struct seq_operations dyn_event_seq_op = {
+	.start	= dyn_event_seq_start,
+	.next	= dyn_event_seq_next,
+	.stop	= dyn_event_seq_stop,
+	.show	= dyn_event_seq_show
+};
+
+/*
+ * dyn_events_release_all - Release all specific events
+ * @type:	the dyn_event_operations * which filters releasing events
+ *
+ * This releases all events which ->ops matches @type. If @type is NULL,
+ * all events are released.
+ * Return -EBUSY if any of them are in use, and return other errors when
+ * it failed to free the given event. Except for -EBUSY, event releasing
+ * process will be aborted at that point and there may be some other
+ * releasable events on the list.
+ */
+int dyn_events_release_all(struct dyn_event_operations *type)
+{
+	struct dyn_event *ev, *tmp;
+	int ret = 0;
+
+	mutex_lock(&event_mutex);
+	for_each_dyn_event(ev) {
+		if (type && ev->ops != type)
+			continue;
+		if (ev->ops->is_busy(ev)) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+	for_each_dyn_event_safe(ev, tmp) {
+		if (type && ev->ops != type)
+			continue;
+		ret = ev->ops->free(ev);
+		if (ret)
+			break;
+	}
+out:
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int dyn_event_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		ret = dyn_events_release_all(NULL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return seq_open(file, &dyn_event_seq_op);
+}
+
+static ssize_t dyn_event_write(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_dyn_event);
+}
+
+static const struct file_operations dynamic_events_ops = {
+	.owner          = THIS_MODULE,
+	.open           = dyn_event_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+	.write		= dyn_event_write,
+};
+
+/* Make a tracefs interface for controlling dynamic events */
+static __init int init_dynamic_event(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (IS_ERR(d_tracer))
+		return 0;
+
+	entry = tracefs_create_file("dynamic_events", 0644, d_tracer,
+				    NULL, &dynamic_events_ops);
+
+	/* Event list interface */
+	if (!entry)
+		pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+
+	return 0;
+}
+fs_initcall(init_dynamic_event);
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
new file mode 100644
index 000000000000..8c334064e4d6
--- /dev/null
+++ b/kernel/trace/trace_dynevent.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common header file for generic dynamic events.
+ */
+
+#ifndef _TRACE_DYNEVENT_H
+#define _TRACE_DYNEVENT_H
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+
+#include "trace.h"
+
+struct dyn_event;
+
+/**
+ * struct dyn_event_operations - Methods for each type of dynamic events
+ *
+ * These methods must be set for each type, since there is no default method.
+ * Before using this for dyn_event_init(), it must be registered by
+ * dyn_event_register().
+ *
+ * @create: Parse and create event method. This is invoked when user passes
+ *  a event definition to dynamic_events interface. This must not destruct
+ *  the arguments and return -ECANCELED if given arguments doesn't match its
+ *  command prefix.
+ * @show: Showing method. This is invoked when user reads the event definitions
+ *  via dynamic_events interface.
+ * @is_busy: Check whether given event is busy so that it can not be deleted.
+ *  Return true if it is busy, otherwides false.
+ * @free: Delete the given event. Return 0 if success, otherwides error.
+ * @match: Check whether given event and system name match this event.
+ *  Return true if it matches, otherwides false.
+ *
+ * Except for @create, these methods are called under holding event_mutex.
+ */
+struct dyn_event_operations {
+	struct list_head	list;
+	int (*create)(int argc, const char *argv[]);
+	int (*show)(struct seq_file *m, struct dyn_event *ev);
+	bool (*is_busy)(struct dyn_event *ev);
+	int (*free)(struct dyn_event *ev);
+	bool (*match)(const char *system, const char *event,
+			struct dyn_event *ev);
+};
+
+/* Register new dyn_event type -- must be called at first */
+int dyn_event_register(struct dyn_event_operations *ops);
+
+/**
+ * struct dyn_event - Dynamic event list header
+ *
+ * The dyn_event structure encapsulates a list and a pointer to the operators
+ * for making a global list of dynamic events.
+ * User must includes this in each event structure, so that those events can
+ * be added/removed via dynamic_events interface.
+ */
+struct dyn_event {
+	struct list_head		list;
+	struct dyn_event_operations	*ops;
+};
+
+extern struct list_head dyn_event_list;
+
+static inline
+int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
+{
+	if (!ev || !ops)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&ev->list);
+	ev->ops = ops;
+	return 0;
+}
+
+static inline int dyn_event_add(struct dyn_event *ev)
+{
+	lockdep_assert_held(&event_mutex);
+
+	if (!ev || !ev->ops)
+		return -EINVAL;
+
+	list_add_tail(&ev->list, &dyn_event_list);
+	return 0;
+}
+
+static inline void dyn_event_remove(struct dyn_event *ev)
+{
+	lockdep_assert_held(&event_mutex);
+	list_del_init(&ev->list);
+}
+
+void *dyn_event_seq_start(struct seq_file *m, loff_t *pos);
+void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void dyn_event_seq_stop(struct seq_file *m, void *v);
+int dyn_events_release_all(struct dyn_event_operations *type);
+int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type);
+
+/*
+ * for_each_dyn_event	-	iterate over the dyn_event list
+ * @pos:	the struct dyn_event * to use as a loop cursor
+ *
+ * This is just a basement of for_each macro. Wrap this for
+ * each actual event structure with ops filtering.
+ */
+#define for_each_dyn_event(pos)	\
+	list_for_each_entry(pos, &dyn_event_list, list)
+
+/*
+ * for_each_dyn_event	-	iterate over the dyn_event list safely
+ * @pos:	the struct dyn_event * to use as a loop cursor
+ * @n:		the struct dyn_event * to use as temporary storage
+ */
+#define for_each_dyn_event_safe(pos, n)	\
+	list_for_each_entry_safe(pos, n, &dyn_event_list, list)
+
+#endif
-- 
cgit 


From 6212dd29683eec51d6d05374a66ac953e81250e9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:02:36 +0900
Subject: tracing/kprobes: Use dyn_event framework for kprobe events

Use dyn_event framework for kprobe events. This shows
kprobe events on "tracing/dynamic_events" file.

User can also define new events via tracing/dynamic_events.

Link: http://lkml.kernel.org/r/154140855646.17322.6619219995865980392.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/kprobetrace.rst |   3 +
 kernel/trace/Kconfig                |   1 +
 kernel/trace/trace_kprobe.c         | 319 ++++++++++++++++++++----------------
 kernel/trace/trace_probe.c          |  27 +++
 kernel/trace/trace_probe.h          |   2 +
 5 files changed, 207 insertions(+), 145 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 47e765c2f2c3..235ce2ab131a 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -20,6 +20,9 @@ current_tracer. Instead of that, add probe points via
 /sys/kernel/debug/tracing/kprobe_events, and enable it via
 /sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable.
 
+You can also use /sys/kernel/debug/tracing/dynamic_events instead of
+kprobe_events. That interface will provide unified access to other
+dynamic events too.
 
 Synopsis of kprobe_events
 -------------------------
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index bf2e8a5a91f1..c0f6b0105609 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -461,6 +461,7 @@ config KPROBE_EVENTS
 	bool "Enable kprobes-based dynamic events"
 	select TRACING
 	select PROBE_EVENTS
+	select DYNAMIC_EVENTS
 	default y
 	help
 	  This allows the user to add tracing events (similar to tracepoints)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d313bcc259dc..bdf8c2ad5152 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -12,6 +12,7 @@
 #include <linux/rculist.h>
 #include <linux/error-injection.h>
 
+#include "trace_dynevent.h"
 #include "trace_kprobe_selftest.h"
 #include "trace_probe.h"
 #include "trace_probe_tmpl.h"
@@ -19,17 +20,51 @@
 #define KPROBE_EVENT_SYSTEM "kprobes"
 #define KRETPROBE_MAXACTIVE_MAX 4096
 
+static int trace_kprobe_create(int argc, const char **argv);
+static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_kprobe_release(struct dyn_event *ev);
+static bool trace_kprobe_is_busy(struct dyn_event *ev);
+static bool trace_kprobe_match(const char *system, const char *event,
+			       struct dyn_event *ev);
+
+static struct dyn_event_operations trace_kprobe_ops = {
+	.create = trace_kprobe_create,
+	.show = trace_kprobe_show,
+	.is_busy = trace_kprobe_is_busy,
+	.free = trace_kprobe_release,
+	.match = trace_kprobe_match,
+};
+
 /**
  * Kprobe event core functions
  */
 struct trace_kprobe {
-	struct list_head	list;
+	struct dyn_event	devent;
 	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
 	unsigned long __percpu *nhit;
 	const char		*symbol;	/* symbol name */
 	struct trace_probe	tp;
 };
 
+static bool is_trace_kprobe(struct dyn_event *ev)
+{
+	return ev->ops == &trace_kprobe_ops;
+}
+
+static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
+{
+	return container_of(ev, struct trace_kprobe, devent);
+}
+
+/**
+ * for_each_trace_kprobe - iterate over the trace_kprobe list
+ * @pos:	the struct trace_kprobe * for each entry
+ * @dpos:	the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_kprobe(pos, dpos)	\
+	for_each_dyn_event(dpos)		\
+		if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
+
 #define SIZEOF_TRACE_KPROBE(n)				\
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
@@ -81,6 +116,22 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 	return ret;
 }
 
+static bool trace_kprobe_is_busy(struct dyn_event *ev)
+{
+	struct trace_kprobe *tk = to_trace_kprobe(ev);
+
+	return trace_probe_is_enabled(&tk->tp);
+}
+
+static bool trace_kprobe_match(const char *system, const char *event,
+			       struct dyn_event *ev)
+{
+	struct trace_kprobe *tk = to_trace_kprobe(ev);
+
+	return strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
+	    (!system || strcmp(tk->tp.call.class->system, system) == 0);
+}
+
 static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 {
 	unsigned long nhit = 0;
@@ -128,9 +179,6 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call)
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
-static DEFINE_MUTEX(probe_lock);
-static LIST_HEAD(probe_list);
-
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
 static int kretprobe_dispatcher(struct kretprobe_instance *ri,
 				struct pt_regs *regs);
@@ -192,7 +240,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
 	if (!tk->tp.class.system)
 		goto error;
 
-	INIT_LIST_HEAD(&tk->list);
+	dyn_event_init(&tk->devent, &trace_kprobe_ops);
 	INIT_LIST_HEAD(&tk->tp.files);
 	return tk;
 error:
@@ -207,6 +255,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
 {
 	int i;
 
+	if (!tk)
+		return;
+
 	for (i = 0; i < tk->tp.nr_args; i++)
 		traceprobe_free_probe_arg(&tk->tp.args[i]);
 
@@ -220,9 +271,10 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
 static struct trace_kprobe *find_trace_kprobe(const char *event,
 					      const char *group)
 {
+	struct dyn_event *pos;
 	struct trace_kprobe *tk;
 
-	list_for_each_entry(tk, &probe_list, list)
+	for_each_trace_kprobe(tk, pos)
 		if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
 		    strcmp(tk->tp.call.class->system, group) == 0)
 			return tk;
@@ -321,7 +373,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 	 * created with perf_event_open. We don't need to wait for these
 	 * trace_kprobes
 	 */
-	if (list_empty(&tk->list))
+	if (list_empty(&tk->devent.list))
 		wait = 0;
  out:
 	if (wait) {
@@ -419,7 +471,7 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk)
 	}
 }
 
-/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+/* Unregister a trace_probe and probe_event */
 static int unregister_trace_kprobe(struct trace_kprobe *tk)
 {
 	/* Enabled event can not be unregistered */
@@ -431,7 +483,7 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
 		return -EBUSY;
 
 	__unregister_trace_kprobe(tk);
-	list_del(&tk->list);
+	dyn_event_remove(&tk->devent);
 
 	return 0;
 }
@@ -442,7 +494,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
 	struct trace_kprobe *old_tk;
 	int ret;
 
-	mutex_lock(&probe_lock);
+	mutex_lock(&event_mutex);
 
 	/* Delete old (same name) event if exist */
 	old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
@@ -471,10 +523,10 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
 	if (ret < 0)
 		unregister_kprobe_event(tk);
 	else
-		list_add_tail(&tk->list, &probe_list);
+		dyn_event_add(&tk->devent);
 
 end:
-	mutex_unlock(&probe_lock);
+	mutex_unlock(&event_mutex);
 	return ret;
 }
 
@@ -483,6 +535,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
 				       unsigned long val, void *data)
 {
 	struct module *mod = data;
+	struct dyn_event *pos;
 	struct trace_kprobe *tk;
 	int ret;
 
@@ -490,8 +543,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
 		return NOTIFY_DONE;
 
 	/* Update probes on coming module */
-	mutex_lock(&probe_lock);
-	list_for_each_entry(tk, &probe_list, list) {
+	mutex_lock(&event_mutex);
+	for_each_trace_kprobe(tk, pos) {
 		if (trace_kprobe_within_module(tk, mod)) {
 			/* Don't need to check busy - this should have gone. */
 			__unregister_trace_kprobe(tk);
@@ -502,7 +555,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
 					mod->name, ret);
 		}
 	}
-	mutex_unlock(&probe_lock);
+	mutex_unlock(&event_mutex);
 
 	return NOTIFY_DONE;
 }
@@ -520,7 +573,7 @@ static inline void sanitize_event_name(char *name)
 			*name = '_';
 }
 
-static int create_trace_kprobe(int argc, char **argv)
+static int trace_kprobe_create(int argc, const char *argv[])
 {
 	/*
 	 * Argument syntax:
@@ -544,9 +597,10 @@ static int create_trace_kprobe(int argc, char **argv)
 	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.
 	 */
 	struct trace_kprobe *tk;
-	int i, ret = 0;
-	bool is_return = false, is_delete = false;
-	char *symbol = NULL, *event = NULL, *group = NULL;
+	int i, len, ret = 0;
+	bool is_return = false;
+	char *symbol = NULL, *tmp = NULL;
+	const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
 	int maxactive = 0;
 	long offset = 0;
 	void *addr = NULL;
@@ -554,26 +608,26 @@ static int create_trace_kprobe(int argc, char **argv)
 	unsigned int flags = TPARG_FL_KERNEL;
 
 	/* argc must be >= 1 */
-	if (argv[0][0] == 'p')
-		is_return = false;
-	else if (argv[0][0] == 'r') {
+	if (argv[0][0] == 'r') {
 		is_return = true;
 		flags |= TPARG_FL_RETURN;
-	} else if (argv[0][0] == '-')
-		is_delete = true;
-	else {
-		pr_info("Probe definition must be started with 'p', 'r' or"
-			" '-'.\n");
-		return -EINVAL;
-	}
+	} else if (argv[0][0] != 'p' || argc < 2)
+		return -ECANCELED;
 
 	event = strchr(&argv[0][1], ':');
-	if (event) {
-		event[0] = '\0';
+	if (event)
 		event++;
-	}
+
 	if (is_return && isdigit(argv[0][1])) {
-		ret = kstrtouint(&argv[0][1], 0, &maxactive);
+		if (event)
+			len = event - &argv[0][1] - 1;
+		else
+			len = strlen(&argv[0][1]);
+		if (len > MAX_EVENT_NAME_LEN - 1)
+			return -E2BIG;
+		memcpy(buf, &argv[0][1], len);
+		buf[len] = '\0';
+		ret = kstrtouint(buf, 0, &maxactive);
 		if (ret) {
 			pr_info("Failed to parse maxactive.\n");
 			return ret;
@@ -588,74 +642,37 @@ static int create_trace_kprobe(int argc, char **argv)
 		}
 	}
 
-	if (event) {
-		char *slash;
-
-		slash = strchr(event, '/');
-		if (slash) {
-			group = event;
-			event = slash + 1;
-			slash[0] = '\0';
-			if (strlen(group) == 0) {
-				pr_info("Group name is not specified\n");
-				return -EINVAL;
-			}
-		}
-		if (strlen(event) == 0) {
-			pr_info("Event name is not specified\n");
-			return -EINVAL;
-		}
-	}
-	if (!group)
-		group = KPROBE_EVENT_SYSTEM;
-
-	if (is_delete) {
-		if (!event) {
-			pr_info("Delete command needs an event name.\n");
-			return -EINVAL;
-		}
-		mutex_lock(&probe_lock);
-		tk = find_trace_kprobe(event, group);
-		if (!tk) {
-			mutex_unlock(&probe_lock);
-			pr_info("Event %s/%s doesn't exist.\n", group, event);
-			return -ENOENT;
-		}
-		/* delete an event */
-		ret = unregister_trace_kprobe(tk);
-		if (ret == 0)
-			free_trace_kprobe(tk);
-		mutex_unlock(&probe_lock);
-		return ret;
-	}
-
-	if (argc < 2) {
-		pr_info("Probe point is not specified.\n");
-		return -EINVAL;
-	}
-
 	/* try to parse an address. if that fails, try to read the
 	 * input as a symbol. */
 	if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
+		/* Check whether uprobe event specified */
+		if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+			return -ECANCELED;
 		/* a symbol specified */
-		symbol = argv[1];
+		symbol = kstrdup(argv[1], GFP_KERNEL);
+		if (!symbol)
+			return -ENOMEM;
 		/* TODO: support .init module functions */
 		ret = traceprobe_split_symbol_offset(symbol, &offset);
 		if (ret || offset < 0 || offset > UINT_MAX) {
 			pr_info("Failed to parse either an address or a symbol.\n");
-			return ret;
+			goto out;
 		}
 		if (kprobe_on_func_entry(NULL, symbol, offset))
 			flags |= TPARG_FL_FENTRY;
 		if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
 			pr_info("Given offset is not valid for return probe.\n");
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out;
 		}
 	}
 	argc -= 2; argv += 2;
 
-	/* setup a probe */
-	if (!event) {
+	if (event) {
+		ret = traceprobe_parse_event_name(&event, &group, buf);
+		if (ret)
+			goto out;
+	} else {
 		/* Make a new event name */
 		if (symbol)
 			snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
@@ -666,17 +683,27 @@ static int create_trace_kprobe(int argc, char **argv)
 		sanitize_event_name(buf);
 		event = buf;
 	}
+
+	/* setup a probe */
 	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
 			       argc, is_return);
 	if (IS_ERR(tk)) {
 		pr_info("Failed to allocate trace_probe.(%d)\n",
 			(int)PTR_ERR(tk));
-		return PTR_ERR(tk);
+		ret = PTR_ERR(tk);
+		goto out;
 	}
 
 	/* parse arguments */
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
-		ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags);
+		tmp = kstrdup(argv[i], GFP_KERNEL);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
+		kfree(tmp);
 		if (ret)
 			goto error;
 	}
@@ -684,60 +711,39 @@ static int create_trace_kprobe(int argc, char **argv)
 	ret = register_trace_kprobe(tk);
 	if (ret)
 		goto error;
-	return 0;
+out:
+	kfree(symbol);
+	return ret;
 
 error:
 	free_trace_kprobe(tk);
-	return ret;
+	goto out;
 }
 
-static int release_all_trace_kprobes(void)
+static int create_or_delete_trace_kprobe(int argc, char **argv)
 {
-	struct trace_kprobe *tk;
-	int ret = 0;
-
-	mutex_lock(&probe_lock);
-	/* Ensure no probe is in use. */
-	list_for_each_entry(tk, &probe_list, list)
-		if (trace_probe_is_enabled(&tk->tp)) {
-			ret = -EBUSY;
-			goto end;
-		}
-	/* TODO: Use batch unregistration */
-	while (!list_empty(&probe_list)) {
-		tk = list_entry(probe_list.next, struct trace_kprobe, list);
-		ret = unregister_trace_kprobe(tk);
-		if (ret)
-			goto end;
-		free_trace_kprobe(tk);
-	}
-
-end:
-	mutex_unlock(&probe_lock);
+	int ret;
 
-	return ret;
-}
+	if (argv[0][0] == '-')
+		return dyn_event_release(argc, argv, &trace_kprobe_ops);
 
-/* Probes listing interfaces */
-static void *probes_seq_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&probe_lock);
-	return seq_list_start(&probe_list, *pos);
+	ret = trace_kprobe_create(argc, (const char **)argv);
+	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
-static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static int trace_kprobe_release(struct dyn_event *ev)
 {
-	return seq_list_next(v, &probe_list, pos);
-}
+	struct trace_kprobe *tk = to_trace_kprobe(ev);
+	int ret = unregister_trace_kprobe(tk);
 
-static void probes_seq_stop(struct seq_file *m, void *v)
-{
-	mutex_unlock(&probe_lock);
+	if (!ret)
+		free_trace_kprobe(tk);
+	return ret;
 }
 
-static int probes_seq_show(struct seq_file *m, void *v)
+static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
 {
-	struct trace_kprobe *tk = v;
+	struct trace_kprobe *tk = to_trace_kprobe(ev);
 	int i;
 
 	seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
@@ -759,10 +765,20 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+	struct dyn_event *ev = v;
+
+	if (!is_trace_kprobe(ev))
+		return 0;
+
+	return trace_kprobe_show(m, ev);
+}
+
 static const struct seq_operations probes_seq_op = {
-	.start  = probes_seq_start,
-	.next   = probes_seq_next,
-	.stop   = probes_seq_stop,
+	.start  = dyn_event_seq_start,
+	.next   = dyn_event_seq_next,
+	.stop   = dyn_event_seq_stop,
 	.show   = probes_seq_show
 };
 
@@ -771,7 +787,7 @@ static int probes_open(struct inode *inode, struct file *file)
 	int ret;
 
 	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
-		ret = release_all_trace_kprobes();
+		ret = dyn_events_release_all(&trace_kprobe_ops);
 		if (ret < 0)
 			return ret;
 	}
@@ -783,7 +799,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
 	return trace_parse_run_command(file, buffer, count, ppos,
-				       create_trace_kprobe);
+				       create_or_delete_trace_kprobe);
 }
 
 static const struct file_operations kprobe_events_ops = {
@@ -798,8 +814,13 @@ static const struct file_operations kprobe_events_ops = {
 /* Probes profiling interfaces */
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
-	struct trace_kprobe *tk = v;
+	struct dyn_event *ev = v;
+	struct trace_kprobe *tk;
 
+	if (!is_trace_kprobe(ev))
+		return 0;
+
+	tk = to_trace_kprobe(ev);
 	seq_printf(m, "  %-44s %15lu %15lu\n",
 		   trace_event_name(&tk->tp.call),
 		   trace_kprobe_nhit(tk),
@@ -809,9 +830,9 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 }
 
 static const struct seq_operations profile_seq_op = {
-	.start  = probes_seq_start,
-	.next   = probes_seq_next,
-	.stop   = probes_seq_stop,
+	.start  = dyn_event_seq_start,
+	.next   = dyn_event_seq_next,
+	.stop   = dyn_event_seq_stop,
 	.show   = probes_profile_seq_show
 };
 
@@ -1332,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	ret = trace_add_event_call(call);
+	ret = trace_add_event_call_nolock(call);
 	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n",
 			trace_event_name(call));
@@ -1347,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
 	int ret;
 
 	/* tp->event is unregistered in trace_remove_event_call() */
-	ret = trace_remove_event_call(&tk->tp.call);
+	ret = trace_remove_event_call_nolock(&tk->tp.call);
 	if (!ret)
 		kfree(tk->tp.call.print_fmt);
 	return ret;
@@ -1364,7 +1385,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 	char *event;
 
 	/*
-	 * local trace_kprobes are not added to probe_list, so they are never
+	 * local trace_kprobes are not added to dyn_event, so they are never
 	 * searched in find_trace_kprobe(). Therefore, there is no concern of
 	 * duplicated name here.
 	 */
@@ -1422,6 +1443,11 @@ static __init int init_kprobe_trace(void)
 {
 	struct dentry *d_tracer;
 	struct dentry *entry;
+	int ret;
+
+	ret = dyn_event_register(&trace_kprobe_ops);
+	if (ret)
+		return ret;
 
 	if (register_module_notifier(&trace_kprobe_module_nb))
 		return -EINVAL;
@@ -1479,9 +1505,8 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	pr_info("Testing kprobe tracing: ");
 
-	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
-				"$stack $stack0 +0($stack)",
-				create_trace_kprobe);
+	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)",
+				create_or_delete_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function entry.\n");
 		warn++;
@@ -1501,8 +1526,8 @@ static __init int kprobe_trace_self_tests_init(void)
 		}
 	}
 
-	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
-				"$retval", create_trace_kprobe);
+	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval",
+				create_or_delete_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function return.\n");
 		warn++;
@@ -1572,20 +1597,24 @@ static __init int kprobe_trace_self_tests_init(void)
 			disable_trace_kprobe(tk, file);
 	}
 
-	ret = trace_run_command("-:testprobe", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-	ret = trace_run_command("-:testprobe2", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
 end:
-	release_all_trace_kprobes();
+	ret = dyn_events_release_all(&trace_kprobe_ops);
+	if (WARN_ON_ONCE(ret)) {
+		pr_warn("error on cleaning up probes.\n");
+		warn++;
+	}
 	/*
 	 * Wait for the optimizer work to finish. Otherwise it might fiddle
 	 * with probes in already freed __init text.
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 449150c6a87f..ff86417c0149 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -154,6 +154,33 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
 	return 0;
 }
 
+/* @buf must has MAX_EVENT_NAME_LEN size */
+int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
+				char *buf)
+{
+	const char *slash, *event = *pevent;
+
+	slash = strchr(event, '/');
+	if (slash) {
+		if (slash == event) {
+			pr_info("Group name is not specified\n");
+			return -EINVAL;
+		}
+		if (slash - event + 1 > MAX_EVENT_NAME_LEN) {
+			pr_info("Group name is too long\n");
+			return -E2BIG;
+		}
+		strlcpy(buf, event, slash - event + 1);
+		*pgroup = buf;
+		*pevent = slash + 1;
+	}
+	if (strlen(event) == 0) {
+		pr_info("Event name is not specified\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
 
 static int parse_probe_vars(char *arg, const struct fetch_type *t,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index feeec261b356..8a63f8bc01bc 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -279,6 +279,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg);
 extern void traceprobe_free_probe_arg(struct probe_arg *arg);
 
 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
+extern int traceprobe_parse_event_name(const char **pevent,
+				       const char **pgroup, char *buf);
 
 extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
 
-- 
cgit 


From 0597c49c69d53f00df99421d832453f4e92a5006 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:03:04 +0900
Subject: tracing/uprobes: Use dyn_event framework for uprobe events

Use dyn_event framework for uprobe events. This shows
uprobe events on "dynamic_events" file.
User can also define new uprobe events via dynamic_events.

Link: http://lkml.kernel.org/r/154140858481.17322.9091293846515154065.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/uprobetracer.rst |   4 +
 kernel/trace/Kconfig                 |   1 +
 kernel/trace/trace_uprobe.c          | 278 +++++++++++++++++++----------------
 3 files changed, 153 insertions(+), 130 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst
index d0822811527a..4c3bfde2ba47 100644
--- a/Documentation/trace/uprobetracer.rst
+++ b/Documentation/trace/uprobetracer.rst
@@ -18,6 +18,10 @@ current_tracer. Instead of that, add probe points via
 However unlike kprobe-event tracer, the uprobe event interface expects the
 user to calculate the offset of the probepoint in the object.
 
+You can also use /sys/kernel/debug/tracing/dynamic_events instead of
+uprobe_events. That interface will provide unified access to other
+dynamic events too.
+
 Synopsis of uprobe_tracer
 -------------------------
 ::
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c0f6b0105609..2cab3c5dfe2c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -501,6 +501,7 @@ config UPROBE_EVENTS
 	depends on PERF_EVENTS
 	select UPROBES
 	select PROBE_EVENTS
+	select DYNAMIC_EVENTS
 	select TRACING
 	default y
 	help
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 6eaaa2150685..4a7b21c891f3 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -7,6 +7,7 @@
  */
 #define pr_fmt(fmt)	"trace_kprobe: " fmt
 
+#include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/uprobes.h>
@@ -14,6 +15,7 @@
 #include <linux/string.h>
 #include <linux/rculist.h>
 
+#include "trace_dynevent.h"
 #include "trace_probe.h"
 #include "trace_probe_tmpl.h"
 
@@ -37,11 +39,26 @@ struct trace_uprobe_filter {
 	struct list_head	perf_events;
 };
 
+static int trace_uprobe_create(int argc, const char **argv);
+static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
+static int trace_uprobe_release(struct dyn_event *ev);
+static bool trace_uprobe_is_busy(struct dyn_event *ev);
+static bool trace_uprobe_match(const char *system, const char *event,
+			       struct dyn_event *ev);
+
+static struct dyn_event_operations trace_uprobe_ops = {
+	.create = trace_uprobe_create,
+	.show = trace_uprobe_show,
+	.is_busy = trace_uprobe_is_busy,
+	.free = trace_uprobe_release,
+	.match = trace_uprobe_match,
+};
+
 /*
  * uprobe event core functions
  */
 struct trace_uprobe {
-	struct list_head		list;
+	struct dyn_event		devent;
 	struct trace_uprobe_filter	filter;
 	struct uprobe_consumer		consumer;
 	struct path			path;
@@ -53,6 +70,25 @@ struct trace_uprobe {
 	struct trace_probe		tp;
 };
 
+static bool is_trace_uprobe(struct dyn_event *ev)
+{
+	return ev->ops == &trace_uprobe_ops;
+}
+
+static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
+{
+	return container_of(ev, struct trace_uprobe, devent);
+}
+
+/**
+ * for_each_trace_uprobe - iterate over the trace_uprobe list
+ * @pos:	the struct trace_uprobe * for each entry
+ * @dpos:	the struct dyn_event * to use as a loop cursor
+ */
+#define for_each_trace_uprobe(pos, dpos)	\
+	for_each_dyn_event(dpos)		\
+		if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
+
 #define SIZEOF_TRACE_UPROBE(n)				\
 	(offsetof(struct trace_uprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
@@ -60,9 +96,6 @@ struct trace_uprobe {
 static int register_uprobe_event(struct trace_uprobe *tu);
 static int unregister_uprobe_event(struct trace_uprobe *tu);
 
-static DEFINE_MUTEX(uprobe_lock);
-static LIST_HEAD(uprobe_list);
-
 struct uprobe_dispatch_data {
 	struct trace_uprobe	*tu;
 	unsigned long		bp_addr;
@@ -209,6 +242,22 @@ static inline bool is_ret_probe(struct trace_uprobe *tu)
 	return tu->consumer.ret_handler != NULL;
 }
 
+static bool trace_uprobe_is_busy(struct dyn_event *ev)
+{
+	struct trace_uprobe *tu = to_trace_uprobe(ev);
+
+	return trace_probe_is_enabled(&tu->tp);
+}
+
+static bool trace_uprobe_match(const char *system, const char *event,
+			       struct dyn_event *ev)
+{
+	struct trace_uprobe *tu = to_trace_uprobe(ev);
+
+	return strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
+		(!system || strcmp(tu->tp.call.class->system, system) == 0);
+}
+
 /*
  * Allocate new trace_uprobe and initialize it (including uprobes).
  */
@@ -236,7 +285,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
 	if (!tu->tp.class.system)
 		goto error;
 
-	INIT_LIST_HEAD(&tu->list);
+	dyn_event_init(&tu->devent, &trace_uprobe_ops);
 	INIT_LIST_HEAD(&tu->tp.files);
 	tu->consumer.handler = uprobe_dispatcher;
 	if (is_ret)
@@ -255,6 +304,9 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
 {
 	int i;
 
+	if (!tu)
+		return;
+
 	for (i = 0; i < tu->tp.nr_args; i++)
 		traceprobe_free_probe_arg(&tu->tp.args[i]);
 
@@ -267,9 +319,10 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
 
 static struct trace_uprobe *find_probe_event(const char *event, const char *group)
 {
+	struct dyn_event *pos;
 	struct trace_uprobe *tu;
 
-	list_for_each_entry(tu, &uprobe_list, list)
+	for_each_trace_uprobe(tu, pos)
 		if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
 		    strcmp(tu->tp.call.class->system, group) == 0)
 			return tu;
@@ -277,7 +330,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
 	return NULL;
 }
 
-/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
+/* Unregister a trace_uprobe and probe_event */
 static int unregister_trace_uprobe(struct trace_uprobe *tu)
 {
 	int ret;
@@ -286,7 +339,7 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
 	if (ret)
 		return ret;
 
-	list_del(&tu->list);
+	dyn_event_remove(&tu->devent);
 	free_trace_uprobe(tu);
 	return 0;
 }
@@ -302,13 +355,14 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
  */
 static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new)
 {
+	struct dyn_event *pos;
 	struct trace_uprobe *tmp, *old = NULL;
 	struct inode *new_inode = d_real_inode(new->path.dentry);
 
 	old = find_probe_event(trace_event_name(&new->tp.call),
 				new->tp.call.class->system);
 
-	list_for_each_entry(tmp, &uprobe_list, list) {
+	for_each_trace_uprobe(tmp, pos) {
 		if ((old ? old != tmp : true) &&
 		    new_inode == d_real_inode(tmp->path.dentry) &&
 		    new->offset == tmp->offset &&
@@ -326,7 +380,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 	struct trace_uprobe *old_tu;
 	int ret;
 
-	mutex_lock(&uprobe_lock);
+	mutex_lock(&event_mutex);
 
 	/* register as an event */
 	old_tu = find_old_trace_uprobe(tu);
@@ -348,10 +402,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 		goto end;
 	}
 
-	list_add_tail(&tu->list, &uprobe_list);
+	dyn_event_add(&tu->devent);
 
 end:
-	mutex_unlock(&uprobe_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
@@ -362,91 +416,49 @@ end:
  *
  *  - Remove uprobe: -:[GRP/]EVENT
  */
-static int create_trace_uprobe(int argc, char **argv)
+static int trace_uprobe_create(int argc, const char **argv)
 {
 	struct trace_uprobe *tu;
-	char *arg, *event, *group, *filename, *rctr, *rctr_end;
+	const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
+	char *arg, *filename, *rctr, *rctr_end, *tmp;
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
 	unsigned long offset, ref_ctr_offset;
-	bool is_delete, is_return;
+	bool is_return = false;
 	int i, ret;
 
 	ret = 0;
-	is_delete = false;
-	is_return = false;
-	event = NULL;
-	group = NULL;
 	ref_ctr_offset = 0;
 
 	/* argc must be >= 1 */
-	if (argv[0][0] == '-')
-		is_delete = true;
-	else if (argv[0][0] == 'r')
+	if (argv[0][0] == 'r')
 		is_return = true;
-	else if (argv[0][0] != 'p') {
-		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
-		return -EINVAL;
-	}
+	else if (argv[0][0] != 'p' || argc < 2)
+		return -ECANCELED;
 
-	if (argv[0][1] == ':') {
+	if (argv[0][1] == ':')
 		event = &argv[0][2];
-		arg = strchr(event, '/');
 
-		if (arg) {
-			group = event;
-			event = arg + 1;
-			event[-1] = '\0';
+	if (!strchr(argv[1], '/'))
+		return -ECANCELED;
 
-			if (strlen(group) == 0) {
-				pr_info("Group name is not specified\n");
-				return -EINVAL;
-			}
-		}
-		if (strlen(event) == 0) {
-			pr_info("Event name is not specified\n");
-			return -EINVAL;
-		}
-	}
-	if (!group)
-		group = UPROBE_EVENT_SYSTEM;
-
-	if (is_delete) {
-		int ret;
-
-		if (!event) {
-			pr_info("Delete command needs an event name.\n");
-			return -EINVAL;
-		}
-		mutex_lock(&uprobe_lock);
-		tu = find_probe_event(event, group);
-
-		if (!tu) {
-			mutex_unlock(&uprobe_lock);
-			pr_info("Event %s/%s doesn't exist.\n", group, event);
-			return -ENOENT;
-		}
-		/* delete an event */
-		ret = unregister_trace_uprobe(tu);
-		mutex_unlock(&uprobe_lock);
-		return ret;
-	}
+	filename = kstrdup(argv[1], GFP_KERNEL);
+	if (!filename)
+		return -ENOMEM;
 
-	if (argc < 2) {
-		pr_info("Probe point is not specified.\n");
-		return -EINVAL;
-	}
 	/* Find the last occurrence, in case the path contains ':' too. */
-	arg = strrchr(argv[1], ':');
-	if (!arg)
-		return -EINVAL;
+	arg = strrchr(filename, ':');
+	if (!arg || !isdigit(arg[1])) {
+		kfree(filename);
+		return -ECANCELED;
+	}
 
 	*arg++ = '\0';
-	filename = argv[1];
 	ret = kern_path(filename, LOOKUP_FOLLOW, &path);
-	if (ret)
+	if (ret) {
+		kfree(filename);
 		return ret;
-
+	}
 	if (!d_is_reg(path.dentry)) {
 		ret = -EINVAL;
 		goto fail_address_parse;
@@ -480,7 +492,11 @@ static int create_trace_uprobe(int argc, char **argv)
 	argv += 2;
 
 	/* setup a probe */
-	if (!event) {
+	if (event) {
+		ret = traceprobe_parse_event_name(&event, &group, buf);
+		if (ret)
+			goto fail_address_parse;
+	} else {
 		char *tail;
 		char *ptr;
 
@@ -508,18 +524,19 @@ static int create_trace_uprobe(int argc, char **argv)
 	tu->offset = offset;
 	tu->ref_ctr_offset = ref_ctr_offset;
 	tu->path = path;
-	tu->filename = kstrdup(filename, GFP_KERNEL);
-
-	if (!tu->filename) {
-		pr_info("Failed to allocate filename.\n");
-		ret = -ENOMEM;
-		goto error;
-	}
+	tu->filename = filename;
 
 	/* parse arguments */
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
-		ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
+		tmp = kstrdup(argv[i], GFP_KERNEL);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
 					is_return ? TPARG_FL_RETURN : 0);
+		kfree(tmp);
 		if (ret)
 			goto error;
 	}
@@ -535,55 +552,35 @@ error:
 
 fail_address_parse:
 	path_put(&path);
+	kfree(filename);
 
 	pr_info("Failed to parse address or file.\n");
 
 	return ret;
 }
 
-static int cleanup_all_probes(void)
+static int create_or_delete_trace_uprobe(int argc, char **argv)
 {
-	struct trace_uprobe *tu;
-	int ret = 0;
+	int ret;
 
-	mutex_lock(&uprobe_lock);
-	/* Ensure no probe is in use. */
-	list_for_each_entry(tu, &uprobe_list, list)
-		if (trace_probe_is_enabled(&tu->tp)) {
-			ret = -EBUSY;
-			goto end;
-		}
-	while (!list_empty(&uprobe_list)) {
-		tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
-		ret = unregister_trace_uprobe(tu);
-		if (ret)
-			break;
-	}
-end:
-	mutex_unlock(&uprobe_lock);
-	return ret;
-}
+	if (argv[0][0] == '-')
+		return dyn_event_release(argc, argv, &trace_uprobe_ops);
 
-/* Probes listing interfaces */
-static void *probes_seq_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&uprobe_lock);
-	return seq_list_start(&uprobe_list, *pos);
+	ret = trace_uprobe_create(argc, (const char **)argv);
+	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
-static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static int trace_uprobe_release(struct dyn_event *ev)
 {
-	return seq_list_next(v, &uprobe_list, pos);
-}
+	struct trace_uprobe *tu = to_trace_uprobe(ev);
 
-static void probes_seq_stop(struct seq_file *m, void *v)
-{
-	mutex_unlock(&uprobe_lock);
+	return unregister_trace_uprobe(tu);
 }
 
-static int probes_seq_show(struct seq_file *m, void *v)
+/* Probes listing interfaces */
+static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev)
 {
-	struct trace_uprobe *tu = v;
+	struct trace_uprobe *tu = to_trace_uprobe(ev);
 	char c = is_ret_probe(tu) ? 'r' : 'p';
 	int i;
 
@@ -601,11 +598,21 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+	struct dyn_event *ev = v;
+
+	if (!is_trace_uprobe(ev))
+		return 0;
+
+	return trace_uprobe_show(m, ev);
+}
+
 static const struct seq_operations probes_seq_op = {
-	.start	= probes_seq_start,
-	.next	= probes_seq_next,
-	.stop	= probes_seq_stop,
-	.show	= probes_seq_show
+	.start  = dyn_event_seq_start,
+	.next   = dyn_event_seq_next,
+	.stop   = dyn_event_seq_stop,
+	.show   = probes_seq_show
 };
 
 static int probes_open(struct inode *inode, struct file *file)
@@ -613,7 +620,7 @@ static int probes_open(struct inode *inode, struct file *file)
 	int ret;
 
 	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
-		ret = cleanup_all_probes();
+		ret = dyn_events_release_all(&trace_uprobe_ops);
 		if (ret)
 			return ret;
 	}
@@ -624,7 +631,8 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
+	return trace_parse_run_command(file, buffer, count, ppos,
+					create_or_delete_trace_uprobe);
 }
 
 static const struct file_operations uprobe_events_ops = {
@@ -639,17 +647,22 @@ static const struct file_operations uprobe_events_ops = {
 /* Probes profiling interfaces */
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
-	struct trace_uprobe *tu = v;
+	struct dyn_event *ev = v;
+	struct trace_uprobe *tu;
+
+	if (!is_trace_uprobe(ev))
+		return 0;
 
+	tu = to_trace_uprobe(ev);
 	seq_printf(m, "  %s %-44s %15lu\n", tu->filename,
 			trace_event_name(&tu->tp.call), tu->nhit);
 	return 0;
 }
 
 static const struct seq_operations profile_seq_op = {
-	.start	= probes_seq_start,
-	.next	= probes_seq_next,
-	.stop	= probes_seq_stop,
+	.start  = dyn_event_seq_start,
+	.next   = dyn_event_seq_next,
+	.stop   = dyn_event_seq_stop,
 	.show	= probes_profile_seq_show
 };
 
@@ -1307,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 		return -ENODEV;
 	}
 
-	ret = trace_add_event_call(call);
+	ret = trace_add_event_call_nolock(call);
 
 	if (ret) {
 		pr_info("Failed to register uprobe event: %s\n",
@@ -1324,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
 	int ret;
 
 	/* tu->event is unregistered in trace_remove_event_call() */
-	ret = trace_remove_event_call(&tu->tp.call);
+	ret = trace_remove_event_call_nolock(&tu->tp.call);
 	if (ret)
 		return ret;
 	kfree(tu->tp.call.print_fmt);
@@ -1351,7 +1364,7 @@ create_local_trace_uprobe(char *name, unsigned long offs,
 	}
 
 	/*
-	 * local trace_kprobes are not added to probe_list, so they are never
+	 * local trace_kprobes are not added to dyn_event, so they are never
 	 * searched in find_trace_kprobe(). Therefore, there is no concern of
 	 * duplicated name "DUMMY_EVENT" here.
 	 */
@@ -1399,6 +1412,11 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call)
 static __init int init_uprobe_trace(void)
 {
 	struct dentry *d_tracer;
+	int ret;
+
+	ret = dyn_event_register(&trace_uprobe_ops);
+	if (ret)
+		return ret;
 
 	d_tracer = tracing_init_dentry();
 	if (IS_ERR(d_tracer))
-- 
cgit 


From 7bbab38d07f3185fddf6fce126e2239010efdfce Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:03:33 +0900
Subject: tracing: Use dyn_event framework for synthetic events

Use dyn_event framework for synthetic events. This shows
synthetic events on "tracing/dynamic_events" file in addition
to tracing/synthetic_events interface.

User can also define new events via tracing/dynamic_events
with "s:" prefix. So, the new syntax is below;

  s:[synthetic/]EVENT_NAME TYPE ARG; [TYPE ARG;]...

To remove events via tracing/dynamic_events, you can use
"-:" prefix as same as other events.

Link: http://lkml.kernel.org/r/154140861301.17322.15454611233735614508.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig             |   1 +
 kernel/trace/trace.c             |   8 ++
 kernel/trace/trace_events_hist.c | 265 ++++++++++++++++++++++++---------------
 3 files changed, 176 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2cab3c5dfe2c..fa8b1fe824f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -635,6 +635,7 @@ config HIST_TRIGGERS
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select TRACING_MAP
 	select TRACING
+	select DYNAMIC_EVENTS
 	default n
 	help
 	  Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7e0332f90ed4..911470ad9e94 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4620,6 +4620,9 @@ static const char readme_msg[] =
 	"\t  accepts: event-definitions (one definition per line)\n"
 	"\t   Format: p[:[<group>/]<event>] <place> [<args>]\n"
 	"\t           r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t           s:[synthetic/]<event> <field> [<field>]\n"
+#endif
 	"\t           -:[<group>/]<event>\n"
 #ifdef CONFIG_KPROBE_EVENTS
 	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4638,6 +4641,11 @@ static const char readme_msg[] =
 	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
 	"\t           b<bit-width>@<bit-offset>/<container-size>,\n"
 	"\t           <type>\\[<array-size>\\]\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t    field: <stype> <name>;\n"
+	"\t    stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
+	"\t           [unsigned] char/int/long\n"
+#endif
 #endif
 	"  events/\t\t- Directory containing all trace event subsystems:\n"
 	"      enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0feb7f460123..414aabd67d1f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -15,6 +15,7 @@
 
 #include "tracing_map.h"
 #include "trace.h"
+#include "trace_dynevent.h"
 
 #define SYNTH_SYSTEM		"synthetic"
 #define SYNTH_FIELDS_MAX	16
@@ -292,6 +293,21 @@ struct hist_trigger_data {
 	unsigned int			n_max_var_str;
 };
 
+static int synth_event_create(int argc, const char **argv);
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
+static int synth_event_release(struct dyn_event *ev);
+static bool synth_event_is_busy(struct dyn_event *ev);
+static bool synth_event_match(const char *system, const char *event,
+			      struct dyn_event *ev);
+
+static struct dyn_event_operations synth_event_ops = {
+	.create = synth_event_create,
+	.show = synth_event_show,
+	.is_busy = synth_event_is_busy,
+	.free = synth_event_release,
+	.match = synth_event_match,
+};
+
 struct synth_field {
 	char *type;
 	char *name;
@@ -301,7 +317,7 @@ struct synth_field {
 };
 
 struct synth_event {
-	struct list_head			list;
+	struct dyn_event			devent;
 	int					ref;
 	char					*name;
 	struct synth_field			**fields;
@@ -312,6 +328,32 @@ struct synth_event {
 	struct tracepoint			*tp;
 };
 
+static bool is_synth_event(struct dyn_event *ev)
+{
+	return ev->ops == &synth_event_ops;
+}
+
+static struct synth_event *to_synth_event(struct dyn_event *ev)
+{
+	return container_of(ev, struct synth_event, devent);
+}
+
+static bool synth_event_is_busy(struct dyn_event *ev)
+{
+	struct synth_event *event = to_synth_event(ev);
+
+	return event->ref != 0;
+}
+
+static bool synth_event_match(const char *system, const char *event,
+			      struct dyn_event *ev)
+{
+	struct synth_event *sev = to_synth_event(ev);
+
+	return strcmp(sev->name, event) == 0 &&
+		(!system || strcmp(system, SYNTH_SYSTEM) == 0);
+}
+
 struct action_data;
 
 typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
@@ -402,7 +444,6 @@ static bool have_hist_err(void)
 	return false;
 }
 
-static LIST_HEAD(synth_event_list);
 static DEFINE_MUTEX(synth_event_mutex);
 
 struct synth_trace_event {
@@ -738,14 +779,12 @@ static void free_synth_field(struct synth_field *field)
 	kfree(field);
 }
 
-static struct synth_field *parse_synth_field(int argc, char **argv,
+static struct synth_field *parse_synth_field(int argc, const char **argv,
 					     int *consumed)
 {
 	struct synth_field *field;
-	const char *prefix = NULL;
-	char *field_type = argv[0], *field_name;
+	const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
 	int len, ret = 0;
-	char *array;
 
 	if (field_type[0] == ';')
 		field_type++;
@@ -762,20 +801,31 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
 		*consumed = 2;
 	}
 
-	len = strlen(field_name);
-	if (field_name[len - 1] == ';')
-		field_name[len - 1] = '\0';
-
 	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field)
 		return ERR_PTR(-ENOMEM);
 
-	len = strlen(field_type) + 1;
+	len = strlen(field_name);
 	array = strchr(field_name, '[');
+	if (array)
+		len -= strlen(array);
+	else if (field_name[len - 1] == ';')
+		len--;
+
+	field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
+	if (!field->name) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	if (field_type[0] == ';')
+		field_type++;
+	len = strlen(field_type) + 1;
 	if (array)
 		len += strlen(array);
 	if (prefix)
 		len += strlen(prefix);
+
 	field->type = kzalloc(len, GFP_KERNEL);
 	if (!field->type) {
 		ret = -ENOMEM;
@@ -786,7 +836,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
 	strcat(field->type, field_type);
 	if (array) {
 		strcat(field->type, array);
-		*array = '\0';
+		if (field->type[len - 1] == ';')
+			field->type[len - 1] = '\0';
 	}
 
 	field->size = synth_field_size(field->type);
@@ -800,11 +851,6 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
 
 	field->is_signed = synth_field_signed(field->type);
 
-	field->name = kstrdup(field_name, GFP_KERNEL);
-	if (!field->name) {
-		ret = -ENOMEM;
-		goto free;
-	}
  out:
 	return field;
  free:
@@ -868,9 +914,13 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
 
 static struct synth_event *find_synth_event(const char *name)
 {
+	struct dyn_event *pos;
 	struct synth_event *event;
 
-	list_for_each_entry(event, &synth_event_list, list) {
+	for_each_dyn_event(pos) {
+		if (!is_synth_event(pos))
+			continue;
+		event = to_synth_event(pos);
 		if (strcmp(event->name, name) == 0)
 			return event;
 	}
@@ -921,7 +971,7 @@ static int register_synth_event(struct synth_event *event)
 
 	ret = set_synth_event_print_fmt(call);
 	if (ret < 0) {
-		trace_remove_event_call(call);
+		trace_remove_event_call_nolock(call);
 		goto err;
 	}
  out:
@@ -959,7 +1009,7 @@ static void free_synth_event(struct synth_event *event)
 	kfree(event);
 }
 
-static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
+static struct synth_event *alloc_synth_event(const char *name, int n_fields,
 					     struct synth_field **fields)
 {
 	struct synth_event *event;
@@ -971,7 +1021,7 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
 		goto out;
 	}
 
-	event->name = kstrdup(event_name, GFP_KERNEL);
+	event->name = kstrdup(name, GFP_KERNEL);
 	if (!event->name) {
 		kfree(event);
 		event = ERR_PTR(-ENOMEM);
@@ -985,6 +1035,8 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
 		goto out;
 	}
 
+	dyn_event_init(&event->devent, &synth_event_ops);
+
 	for (i = 0; i < n_fields; i++)
 		event->fields[i] = fields[i];
 
@@ -1008,16 +1060,11 @@ struct hist_var_data {
 	struct hist_trigger_data *hist_data;
 };
 
-static int create_synth_event(int argc, char **argv)
+static int __create_synth_event(int argc, const char *name, const char **argv)
 {
 	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
 	struct synth_event *event = NULL;
-	bool delete_event = false;
 	int i, consumed = 0, n_fields = 0, ret = 0;
-	char *name;
-
-	mutex_lock(&event_mutex);
-	mutex_lock(&synth_event_mutex);
 
 	/*
 	 * Argument syntax:
@@ -1025,43 +1072,20 @@ static int create_synth_event(int argc, char **argv)
 	 *  - Remove synthetic event: !<event_name> field[;field] ...
 	 *      where 'field' = type field_name
 	 */
-	if (argc < 1) {
-		ret = -EINVAL;
-		goto out;
-	}
 
-	name = argv[0];
-	if (name[0] == '!') {
-		delete_event = true;
-		name++;
-	}
+	if (name[0] == '\0' || argc < 1)
+		return -EINVAL;
+
+	mutex_lock(&event_mutex);
+	mutex_lock(&synth_event_mutex);
 
 	event = find_synth_event(name);
 	if (event) {
-		if (delete_event) {
-			if (event->ref) {
-				ret = -EBUSY;
-				goto out;
-			}
-			ret = unregister_synth_event(event);
-			if (!ret) {
-				list_del(&event->list);
-				free_synth_event(event);
-			}
-		} else
-			ret = -EEXIST;
-		goto out;
-	} else if (delete_event) {
-		ret = -ENOENT;
+		ret = -EEXIST;
 		goto out;
 	}
 
-	if (argc < 2) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	for (i = 1; i < argc - 1; i++) {
+	for (i = 0; i < argc - 1; i++) {
 		if (strcmp(argv[i], ";") == 0)
 			continue;
 		if (n_fields == SYNTH_FIELDS_MAX) {
@@ -1091,7 +1115,7 @@ static int create_synth_event(int argc, char **argv)
 	}
 	ret = register_synth_event(event);
 	if (!ret)
-		list_add(&event->list, &synth_event_list);
+		dyn_event_add(&event->devent);
 	else
 		free_synth_event(event);
  out:
@@ -1106,57 +1130,77 @@ static int create_synth_event(int argc, char **argv)
 	goto out;
 }
 
-static int release_all_synth_events(void)
+static int create_or_delete_synth_event(int argc, char **argv)
 {
-	struct synth_event *event, *e;
-	int ret = 0;
-
-	mutex_lock(&event_mutex);
-	mutex_lock(&synth_event_mutex);
-
-	list_for_each_entry(event, &synth_event_list, list) {
-		if (event->ref) {
-			mutex_unlock(&synth_event_mutex);
-			return -EBUSY;
-		}
-	}
+	const char *name = argv[0];
+	struct synth_event *event = NULL;
+	int ret;
 
-	list_for_each_entry_safe(event, e, &synth_event_list, list) {
-		ret = unregister_synth_event(event);
-		if (!ret) {
-			list_del(&event->list);
-			free_synth_event(event);
+	/* trace_run_command() ensures argc != 0 */
+	if (name[0] == '!') {
+		mutex_lock(&event_mutex);
+		mutex_lock(&synth_event_mutex);
+		event = find_synth_event(name + 1);
+		if (event) {
+			if (event->ref)
+				ret = -EBUSY;
+			else {
+				ret = unregister_synth_event(event);
+				if (!ret) {
+					dyn_event_remove(&event->devent);
+					free_synth_event(event);
+				}
+			}
 		} else
-			break;
+			ret = -ENOENT;
+		mutex_unlock(&synth_event_mutex);
+		mutex_unlock(&event_mutex);
+		return ret;
 	}
-	mutex_unlock(&synth_event_mutex);
-	mutex_unlock(&event_mutex);
 
-	return ret;
+	ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+	return ret == -ECANCELED ? -EINVAL : ret;
 }
 
-
-static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+static int synth_event_create(int argc, const char **argv)
 {
-	mutex_lock(&synth_event_mutex);
+	const char *name = argv[0];
+	int len;
 
-	return seq_list_start(&synth_event_list, *pos);
-}
+	if (name[0] != 's' || name[1] != ':')
+		return -ECANCELED;
+	name += 2;
 
-static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	return seq_list_next(v, &synth_event_list, pos);
+	/* This interface accepts group name prefix */
+	if (strchr(name, '/')) {
+		len = sizeof(SYNTH_SYSTEM "/") - 1;
+		if (strncmp(name, SYNTH_SYSTEM "/", len))
+			return -EINVAL;
+		name += len;
+	}
+	return __create_synth_event(argc - 1, name, argv + 1);
 }
 
-static void synth_events_seq_stop(struct seq_file *m, void *v)
+static int synth_event_release(struct dyn_event *ev)
 {
-	mutex_unlock(&synth_event_mutex);
+	struct synth_event *event = to_synth_event(ev);
+	int ret;
+
+	if (event->ref)
+		return -EBUSY;
+
+	ret = unregister_synth_event(event);
+	if (ret)
+		return ret;
+
+	dyn_event_remove(ev);
+	free_synth_event(event);
+	return 0;
 }
 
-static int synth_events_seq_show(struct seq_file *m, void *v)
+static int __synth_event_show(struct seq_file *m, struct synth_event *event)
 {
 	struct synth_field *field;
-	struct synth_event *event = v;
 	unsigned int i;
 
 	seq_printf(m, "%s\t", event->name);
@@ -1174,11 +1218,30 @@ static int synth_events_seq_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+	struct synth_event *event = to_synth_event(ev);
+
+	seq_printf(m, "s:%s/", event->class.system);
+
+	return __synth_event_show(m, event);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+	struct dyn_event *ev = v;
+
+	if (!is_synth_event(ev))
+		return 0;
+
+	return __synth_event_show(m, to_synth_event(ev));
+}
+
 static const struct seq_operations synth_events_seq_op = {
-	.start  = synth_events_seq_start,
-	.next   = synth_events_seq_next,
-	.stop   = synth_events_seq_stop,
-	.show   = synth_events_seq_show
+	.start	= dyn_event_seq_start,
+	.next	= dyn_event_seq_next,
+	.stop	= dyn_event_seq_stop,
+	.show	= synth_events_seq_show,
 };
 
 static int synth_events_open(struct inode *inode, struct file *file)
@@ -1186,7 +1249,7 @@ static int synth_events_open(struct inode *inode, struct file *file)
 	int ret;
 
 	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
-		ret = release_all_synth_events();
+		ret = dyn_events_release_all(&synth_event_ops);
 		if (ret < 0)
 			return ret;
 	}
@@ -1199,7 +1262,7 @@ static ssize_t synth_events_write(struct file *file,
 				  size_t count, loff_t *ppos)
 {
 	return trace_parse_run_command(file, buffer, count, ppos,
-				       create_synth_event);
+				       create_or_delete_synth_event);
 }
 
 static const struct file_operations synth_events_fops = {
@@ -5791,6 +5854,12 @@ static __init int trace_events_hist_init(void)
 	struct dentry *d_tracer;
 	int err = 0;
 
+	err = dyn_event_register(&synth_event_ops);
+	if (err) {
+		pr_warn("Could not register synth_event_ops\n");
+		return err;
+	}
+
 	d_tracer = tracing_init_dentry();
 	if (IS_ERR(d_tracer)) {
 		err = PTR_ERR(d_tracer);
-- 
cgit 


From 0e2b81f7b52a1c1a8c46986f9ca01eb7b3c421f8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:04:01 +0900
Subject: tracing: Remove unneeded synth_event_mutex

Rmove unneeded synth_event_mutex. This mutex protects the reference
count in synth_event, however, those operational points are already
protected by event_mutex.

1. In __create_synth_event() and create_or_delete_synth_event(),
 those synth_event_mutex clearly obtained right after event_mutex.

2. event_hist_trigger_func() is trigger_hist_cmd.func() which is
 called by trigger_process_regex(), which is a part of
 event_trigger_regex_write() and this function takes event_mutex.

3. hist_unreg_all() is trigger_hist_cmd.unreg_all() which is called
 by event_trigger_regex_open() and it takes event_mutex.

4. onmatch_destroy() and onmatch_create() have long call tree,
 but both are finally invoked from event_trigger_regex_write()
 and event_trace_del_tracer(), former takes event_mutex, and latter
 ensures called under event_mutex locked.

Finally, I ensured there is no resource conflict. For safety,
I added lockdep_assert_held(&event_mutex) for each function.

Link: http://lkml.kernel.org/r/154140864134.17322.4796059721306031894.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 414aabd67d1f..21e4954375a1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -444,8 +444,6 @@ static bool have_hist_err(void)
 	return false;
 }
 
-static DEFINE_MUTEX(synth_event_mutex);
-
 struct synth_trace_event {
 	struct trace_entry	ent;
 	u64			fields[];
@@ -1077,7 +1075,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
 		return -EINVAL;
 
 	mutex_lock(&event_mutex);
-	mutex_lock(&synth_event_mutex);
 
 	event = find_synth_event(name);
 	if (event) {
@@ -1119,7 +1116,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv)
 	else
 		free_synth_event(event);
  out:
-	mutex_unlock(&synth_event_mutex);
 	mutex_unlock(&event_mutex);
 
 	return ret;
@@ -1139,7 +1135,6 @@ static int create_or_delete_synth_event(int argc, char **argv)
 	/* trace_run_command() ensures argc != 0 */
 	if (name[0] == '!') {
 		mutex_lock(&event_mutex);
-		mutex_lock(&synth_event_mutex);
 		event = find_synth_event(name + 1);
 		if (event) {
 			if (event->ref)
@@ -1153,7 +1148,6 @@ static int create_or_delete_synth_event(int argc, char **argv)
 			}
 		} else
 			ret = -ENOENT;
-		mutex_unlock(&synth_event_mutex);
 		mutex_unlock(&event_mutex);
 		return ret;
 	}
@@ -3535,7 +3529,7 @@ static void onmatch_destroy(struct action_data *data)
 {
 	unsigned int i;
 
-	mutex_lock(&synth_event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	kfree(data->onmatch.match_event);
 	kfree(data->onmatch.match_event_system);
@@ -3548,8 +3542,6 @@ static void onmatch_destroy(struct action_data *data)
 		data->onmatch.synth_event->ref--;
 
 	kfree(data);
-
-	mutex_unlock(&synth_event_mutex);
 }
 
 static void destroy_field_var(struct field_var *field_var)
@@ -3700,15 +3692,14 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
 	struct synth_event *event;
 	int ret = 0;
 
-	mutex_lock(&synth_event_mutex);
+	lockdep_assert_held(&event_mutex);
+
 	event = find_synth_event(data->onmatch.synth_event_name);
 	if (!event) {
 		hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
-		mutex_unlock(&synth_event_mutex);
 		return -EINVAL;
 	}
 	event->ref++;
-	mutex_unlock(&synth_event_mutex);
 
 	var_ref_idx = hist_data->n_var_refs;
 
@@ -3782,9 +3773,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
  out:
 	return ret;
  err:
-	mutex_lock(&synth_event_mutex);
 	event->ref--;
-	mutex_unlock(&synth_event_mutex);
 
 	goto out;
 }
@@ -5492,6 +5481,8 @@ static void hist_unreg_all(struct trace_event_file *file)
 	struct synth_event *se;
 	const char *se_name;
 
+	lockdep_assert_held(&event_mutex);
+
 	if (hist_file_check_refs(file))
 		return;
 
@@ -5501,12 +5492,10 @@ static void hist_unreg_all(struct trace_event_file *file)
 			list_del_rcu(&test->list);
 			trace_event_trigger_enable_disable(file, 0);
 
-			mutex_lock(&synth_event_mutex);
 			se_name = trace_event_name(file->event_call);
 			se = find_synth_event(se_name);
 			if (se)
 				se->ref--;
-			mutex_unlock(&synth_event_mutex);
 
 			update_cond_flag(file);
 			if (hist_data->enable_timestamps)
@@ -5532,6 +5521,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	char *trigger, *p;
 	int ret = 0;
 
+	lockdep_assert_held(&event_mutex);
+
 	if (glob && strlen(glob)) {
 		last_cmd_set(param);
 		hist_err_clear();
@@ -5622,14 +5613,10 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 		}
 
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
-
-		mutex_lock(&synth_event_mutex);
 		se_name = trace_event_name(file->event_call);
 		se = find_synth_event(se_name);
 		if (se)
 			se->ref--;
-		mutex_unlock(&synth_event_mutex);
-
 		ret = 0;
 		goto out_free;
 	}
@@ -5665,13 +5652,10 @@ enable:
 	if (ret)
 		goto out_unreg;
 
-	mutex_lock(&synth_event_mutex);
 	se_name = trace_event_name(file->event_call);
 	se = find_synth_event(se_name);
 	if (se)
 		se->ref++;
-	mutex_unlock(&synth_event_mutex);
-
 	/* Just return zero, not the number of registered triggers */
 	ret = 0;
  out:
-- 
cgit 


From c454a46b5efd8eff8880e88ece2976e60a26bf35 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 7 Dec 2018 16:42:25 -0800
Subject: bpf: Add bpf_line_info support

This patch adds bpf_line_info support.

It accepts an array of bpf_line_info objects during BPF_PROG_LOAD.
The "line_info", "line_info_cnt" and "line_info_rec_size" are added
to the "union bpf_attr".  The "line_info_rec_size" makes
bpf_line_info extensible in the future.

The new "check_btf_line()" ensures the userspace line_info is valid
for the kernel to use.

When the verifier is translating/patching the bpf_prog (through
"bpf_patch_insn_single()"), the line_infos' insn_off is also
adjusted by the newly added "bpf_adj_linfo()".

If the bpf_prog is jited, this patch also provides the jited addrs (in
aux->jited_linfo) for the corresponding line_info.insn_off.
"bpf_prog_fill_jited_linfo()" is added to fill the aux->jited_linfo.
It is currently called by the x86 jit.  Other jits can also use
"bpf_prog_fill_jited_linfo()" and it will be done in the followup patches.
In the future, if it deemed necessary, a particular jit could also provide
its own "bpf_prog_fill_jited_linfo()" implementation.

A few "*line_info*" fields are added to the bpf_prog_info such
that the user can get the xlated line_info back (i.e. the line_info
with its insn_off reflecting the translated prog).  The jited_line_info
is available if the prog is jited.  It is an array of __u64.
If the prog is not jited, jited_line_info_cnt is 0.

The verifier's verbose log with line_info will be done in
a follow up patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c  |   2 +
 include/linux/bpf.h          |  21 +++++
 include/linux/bpf_verifier.h |   1 +
 include/linux/btf.h          |   1 +
 include/linux/filter.h       |   7 ++
 include/uapi/linux/bpf.h     |  19 +++++
 kernel/bpf/btf.c             |   2 +-
 kernel/bpf/core.c            | 118 +++++++++++++++++++++++++-
 kernel/bpf/syscall.c         |  83 ++++++++++++++++--
 kernel/bpf/verifier.c        | 198 +++++++++++++++++++++++++++++++++++++------
 10 files changed, 419 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2580cd2e98b1..5542303c43d9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1181,6 +1181,8 @@ out_image:
 	}
 
 	if (!image || !prog->is_func || extra_pass) {
+		if (image)
+			bpf_prog_fill_jited_linfo(prog, addrs);
 out_addrs:
 		kfree(addrs);
 		kfree(jit_data);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e82b7039fc66..0c992b86eb2c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,7 +319,28 @@ struct bpf_prog_aux {
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
+	/* bpf_line_info loaded from userspace.  linfo->insn_off
+	 * has the xlated insn offset.
+	 * Both the main and sub prog share the same linfo.
+	 * The subprog can access its first linfo by
+	 * using the linfo_idx.
+	 */
+	struct bpf_line_info *linfo;
+	/* jited_linfo is the jited addr of the linfo.  It has a
+	 * one to one mapping to linfo:
+	 * jited_linfo[i] is the jited addr for the linfo[i]->insn_off.
+	 * Both the main and sub prog share the same jited_linfo.
+	 * The subprog can access its first jited_linfo by
+	 * using the linfo_idx.
+	 */
+	void **jited_linfo;
 	u32 func_info_cnt;
+	u32 nr_linfo;
+	/* subprog can use linfo_idx to access its first linfo and
+	 * jited_linfo.
+	 * main prog always has linfo_idx == 0
+	 */
+	u32 linfo_idx;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 11f5df1092d9..c736945be7c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -203,6 +203,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
+	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8c2199b5d250..b98405a56383 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,6 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
+bool btf_name_offset_valid(const struct btf *btf, u32 offset);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d16deead65c6..29f21f9d7f68 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -718,6 +718,13 @@ void bpf_prog_free(struct bpf_prog *fp);
 
 bool bpf_opcode_in_insntable(u8 code);
 
+void bpf_prog_free_linfo(struct bpf_prog *prog);
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+			       const u32 *insn_to_jit_off);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
+
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a84fd232d934..7a66db8d15d5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -356,6 +356,9 @@ union bpf_attr {
 		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
 		__aligned_u64	func_info;	/* func info */
 		__u32		func_info_cnt;	/* number of bpf_func_info records */
+		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
+		__aligned_u64	line_info;	/* line info */
+		__u32		line_info_cnt;	/* number of bpf_line_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -2679,6 +2682,12 @@ struct bpf_prog_info {
 	__u32 func_info_rec_size;
 	__aligned_u64 func_info;
 	__u32 func_info_cnt;
+	__u32 line_info_cnt;
+	__aligned_u64 line_info;
+	__aligned_u64 jited_line_info;
+	__u32 jited_line_info_cnt;
+	__u32 line_info_rec_size;
+	__u32 jited_line_info_rec_size;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2995,4 +3004,14 @@ struct bpf_func_info {
 	__u32	type_id;
 };
 
+#define BPF_LINE_INFO_LINE_NUM(line_col)	((line_col) >> 10)
+#define BPF_LINE_INFO_LINE_COL(line_col)	((line_col) & 0x3ff)
+
+struct bpf_line_info {
+	__u32	insn_off;
+	__u32	file_name_off;
+	__u32	line_off;
+	__u32	line_col;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a09b2f94ab25..e0a827f95e19 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -444,7 +444,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 	return kind_ops[BTF_INFO_KIND(t->info)];
 }
 
-static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return BTF_STR_OFFSET_VALID(offset) &&
 		offset < btf->hdr.str_len;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a5b223ef7131..5cdd8da0e7f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -105,6 +105,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
 
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+	if (!prog->aux->nr_linfo || !prog->jit_requested)
+		return 0;
+
+	prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
+					 sizeof(*prog->aux->jited_linfo),
+					 GFP_KERNEL | __GFP_NOWARN);
+	if (!prog->aux->jited_linfo)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+{
+	kfree(prog->aux->jited_linfo);
+	prog->aux->jited_linfo = NULL;
+}
+
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
+{
+	if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
+		bpf_prog_free_jited_linfo(prog);
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off.  Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the last byte of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ *      The first bpf insn off of the prog.  The insn off
+ *      here is relative to the main prog.
+ *      e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ *      The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ *	insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+			       const u32 *insn_to_jit_off)
+{
+	u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+	const struct bpf_line_info *linfo;
+	void **jited_linfo;
+
+	if (!prog->aux->jited_linfo)
+		/* Userspace did not provide linfo */
+		return;
+
+	linfo_idx = prog->aux->linfo_idx;
+	linfo = &prog->aux->linfo[linfo_idx];
+	insn_start = linfo[0].insn_off;
+	insn_end = insn_start + prog->len;
+
+	jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+	jited_linfo[0] = prog->bpf_func;
+
+	nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+	for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+		/* The verifier ensures that linfo[i].insn_off is
+		 * strictly increasing
+		 */
+		jited_linfo[i] = prog->bpf_func +
+			insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
+void bpf_prog_free_linfo(struct bpf_prog *prog)
+{
+	bpf_prog_free_jited_linfo(prog);
+	kvfree(prog->aux->linfo);
+}
+
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags)
 {
@@ -294,6 +379,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
 	return ret;
 }
 
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+	struct bpf_line_info *linfo;
+	u32 i, nr_linfo;
+
+	nr_linfo = prog->aux->nr_linfo;
+	if (!nr_linfo || !delta)
+		return;
+
+	linfo = prog->aux->linfo;
+
+	for (i = 0; i < nr_linfo; i++)
+		if (off < linfo[i].insn_off)
+			break;
+
+	/* Push all off < linfo[i].insn_off by delta */
+	for (; i < nr_linfo; i++)
+		linfo[i].insn_off += delta;
+}
+
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len)
 {
@@ -349,6 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 	 */
 	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
 
+	bpf_adj_linfo(prog_adj, off, insn_delta);
+
 	return prog_adj;
 }
 
@@ -1591,13 +1698,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * be JITed, but falls back to the interpreter.
 	 */
 	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		*err = bpf_prog_alloc_jited_linfo(fp);
+		if (*err)
+			return fp;
+
 		fp = bpf_int_jit_compile(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
 		if (!fp->jited) {
+			bpf_prog_free_jited_linfo(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
 			*err = -ENOTSUPP;
 			return fp;
-		}
 #endif
+		} else {
+			bpf_prog_free_unused_jited_linfo(fp);
+		}
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aa05aa38f4a8..19c88cff7880 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1215,6 +1215,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 		bpf_prog_kallsyms_del_all(prog);
 		btf_put(prog->aux->btf);
 		kvfree(prog->aux->func_info);
+		bpf_prog_free_linfo(prog);
 
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -1439,7 +1440,7 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD func_info_cnt
+#define	BPF_PROG_LOAD_LAST_FIELD line_info_cnt
 
 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
@@ -1560,6 +1561,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	return err;
 
 free_used_maps:
+	bpf_prog_free_linfo(prog);
 	kvfree(prog->aux->func_info);
 	btf_put(prog->aux->btf);
 	bpf_prog_kallsyms_del_subprogs(prog);
@@ -2041,6 +2043,37 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
 	return insns;
 }
 
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+	/*
+	 * Ensure info.*_rec_size is the same as kernel expected size
+	 *
+	 * or
+	 *
+	 * Only allow zero *_rec_size if both _rec_size and _cnt are
+	 * zero.  In this case, the kernel will set the expected
+	 * _rec_size back to the info.
+	 */
+
+	if ((info->func_info_cnt || info->func_info_rec_size) &&
+	    info->func_info_rec_size != sizeof(struct bpf_func_info))
+		return -EINVAL;
+
+	if ((info->line_info_cnt || info->line_info_rec_size) &&
+	    info->line_info_rec_size != sizeof(struct bpf_line_info))
+		return -EINVAL;
+
+	if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) &&
+	    info->jited_line_info_rec_size != sizeof(__u64))
+		return -EINVAL;
+
+	info->func_info_rec_size = sizeof(struct bpf_func_info);
+	info->line_info_rec_size = sizeof(struct bpf_line_info);
+	info->jited_line_info_rec_size = sizeof(__u64);
+
+	return 0;
+}
+
 static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				   const union bpf_attr *attr,
 				   union bpf_attr __user *uattr)
@@ -2083,11 +2116,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 				return -EFAULT;
 	}
 
-	if ((info.func_info_cnt || info.func_info_rec_size) &&
-	    info.func_info_rec_size != sizeof(struct bpf_func_info))
-		return -EINVAL;
-
-	info.func_info_rec_size = sizeof(struct bpf_func_info);
+	err = set_info_rec_size(&info);
+	if (err)
+		return err;
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
@@ -2095,6 +2126,8 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
 		info.func_info_cnt = 0;
+		info.line_info_cnt = 0;
+		info.jited_line_info_cnt = 0;
 		goto done;
 	}
 
@@ -2251,6 +2284,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.line_info_cnt;
+	info.line_info_cnt = prog->aux->nr_linfo;
+	if (info.line_info_cnt && ulen) {
+		if (bpf_dump_raw_ok()) {
+			__u8 __user *user_linfo;
+
+			user_linfo = u64_to_user_ptr(info.line_info);
+			ulen = min_t(u32, info.line_info_cnt, ulen);
+			if (copy_to_user(user_linfo, prog->aux->linfo,
+					 info.line_info_rec_size * ulen))
+				return -EFAULT;
+		} else {
+			info.line_info = 0;
+		}
+	}
+
+	ulen = info.jited_line_info_cnt;
+	if (prog->aux->jited_linfo)
+		info.jited_line_info_cnt = prog->aux->nr_linfo;
+	else
+		info.jited_line_info_cnt = 0;
+	if (info.jited_line_info_cnt && ulen) {
+		if (bpf_dump_raw_ok()) {
+			__u64 __user *user_linfo;
+			u32 i;
+
+			user_linfo = u64_to_user_ptr(info.jited_line_info);
+			ulen = min_t(u32, info.jited_line_info_cnt, ulen);
+			for (i = 0; i < ulen; i++) {
+				if (put_user((__u64)(long)prog->aux->jited_linfo[i],
+					     &user_linfo[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_line_info = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2752d35ad073..9d25506bd55a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4640,15 +4640,17 @@ err_free:
 #define MIN_BPF_FUNCINFO_SIZE	8
 #define MAX_FUNCINFO_REC_SIZE	252
 
-static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
-			  union bpf_attr *attr, union bpf_attr __user *uattr)
+static int check_btf_func(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
 {
 	u32 i, nfuncs, urec_size, min_size, prev_offset;
 	u32 krec_size = sizeof(struct bpf_func_info);
-	struct bpf_func_info *krecord = NULL;
+	struct bpf_func_info *krecord;
 	const struct btf_type *type;
+	struct bpf_prog *prog;
+	const struct btf *btf;
 	void __user *urecord;
-	struct btf *btf;
 	int ret = 0;
 
 	nfuncs = attr->func_info_cnt;
@@ -4668,20 +4670,15 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	btf = btf_get_by_fd(attr->prog_btf_fd);
-	if (IS_ERR(btf)) {
-		verbose(env, "unable to get btf from fd\n");
-		return PTR_ERR(btf);
-	}
+	prog = env->prog;
+	btf = prog->aux->btf;
 
 	urecord = u64_to_user_ptr(attr->func_info);
 	min_size = min_t(u32, krec_size, urec_size);
 
 	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
-	if (!krecord) {
-		ret = -ENOMEM;
-		goto free_btf;
-	}
+	if (!krecord)
+		return -ENOMEM;
 
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -4694,12 +4691,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 				if (put_user(min_size, &uattr->func_info_rec_size))
 					ret = -EFAULT;
 			}
-			goto free_btf;
+			goto err_free;
 		}
 
 		if (copy_from_user(&krecord[i], urecord, min_size)) {
 			ret = -EFAULT;
-			goto free_btf;
+			goto err_free;
 		}
 
 		/* check insn_off */
@@ -4709,20 +4706,20 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 					"nonzero insn_off %u for the first func info record",
 					krecord[i].insn_off);
 				ret = -EINVAL;
-				goto free_btf;
+				goto err_free;
 			}
 		} else if (krecord[i].insn_off <= prev_offset) {
 			verbose(env,
 				"same or smaller insn offset (%u) than previous func info record (%u)",
 				krecord[i].insn_off, prev_offset);
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		if (env->subprog_info[i].start != krecord[i].insn_off) {
 			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		/* check type_id */
@@ -4731,20 +4728,18 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
 			verbose(env, "invalid type id %d in func info",
 				krecord[i].type_id);
 			ret = -EINVAL;
-			goto free_btf;
+			goto err_free;
 		}
 
 		prev_offset = krecord[i].insn_off;
 		urecord += urec_size;
 	}
 
-	prog->aux->btf = btf;
 	prog->aux->func_info = krecord;
 	prog->aux->func_info_cnt = nfuncs;
 	return 0;
 
-free_btf:
-	btf_put(btf);
+err_free:
 	kvfree(krecord);
 	return ret;
 }
@@ -4760,6 +4755,150 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
+#define MIN_BPF_LINEINFO_SIZE	(offsetof(struct bpf_line_info, line_col) + \
+		sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MAX_LINEINFO_REC_SIZE	MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+	struct bpf_subprog_info *sub;
+	struct bpf_line_info *linfo;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	void __user *ulinfo;
+	int err;
+
+	nr_linfo = attr->line_info_cnt;
+	if (!nr_linfo)
+		return 0;
+
+	rec_size = attr->line_info_rec_size;
+	if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+	    rec_size > MAX_LINEINFO_REC_SIZE ||
+	    rec_size & (sizeof(u32) - 1))
+		return -EINVAL;
+
+	/* Need to zero it in case the userspace may
+	 * pass in a smaller bpf_line_info object.
+	 */
+	linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+			 GFP_KERNEL | __GFP_NOWARN);
+	if (!linfo)
+		return -ENOMEM;
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	s = 0;
+	sub = env->subprog_info;
+	ulinfo = u64_to_user_ptr(attr->line_info);
+	expected_size = sizeof(struct bpf_line_info);
+	ncopy = min_t(u32, expected_size, rec_size);
+	for (i = 0; i < nr_linfo; i++) {
+		err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+		if (err) {
+			if (err == -E2BIG) {
+				verbose(env, "nonzero tailing record in line_info");
+				if (put_user(expected_size,
+					     &uattr->line_info_rec_size))
+					err = -EFAULT;
+			}
+			goto err_free;
+		}
+
+		if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+
+		/*
+		 * Check insn_off to ensure
+		 * 1) strictly increasing AND
+		 * 2) bounded by prog->len
+		 *
+		 * The linfo[0].insn_off == 0 check logically falls into
+		 * the later "missing bpf_line_info for func..." case
+		 * because the first linfo[0].insn_off must be the
+		 * first sub also and the first sub must have
+		 * subprog_info[0].start == 0.
+		 */
+		if ((i && linfo[i].insn_off <= prev_offset) ||
+		    linfo[i].insn_off >= prog->len) {
+			verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+				i, linfo[i].insn_off, prev_offset,
+				prog->len);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (!btf_name_offset_valid(btf, linfo[i].line_off) ||
+		    !btf_name_offset_valid(btf, linfo[i].file_name_off)) {
+			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (s != env->subprog_cnt) {
+			if (linfo[i].insn_off == sub[s].start) {
+				sub[s].linfo_idx = i;
+				s++;
+			} else if (sub[s].start < linfo[i].insn_off) {
+				verbose(env, "missing bpf_line_info for func#%u\n", s);
+				err = -EINVAL;
+				goto err_free;
+			}
+		}
+
+		prev_offset = linfo[i].insn_off;
+		ulinfo += rec_size;
+	}
+
+	if (s != env->subprog_cnt) {
+		verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+			env->subprog_cnt - s, s);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	prog->aux->linfo = linfo;
+	prog->aux->nr_linfo = nr_linfo;
+
+	return 0;
+
+err_free:
+	kvfree(linfo);
+	return err;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct btf *btf;
+	int err;
+
+	if (!attr->func_info_cnt && !attr->line_info_cnt)
+		return 0;
+
+	btf = btf_get_by_fd(attr->prog_btf_fd);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+	env->prog->aux->btf = btf;
+
+	err = check_btf_func(env, attr, uattr);
+	if (err)
+		return err;
+
+	err = check_btf_line(env, attr, uattr);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 /* check %cur's range satisfies %old's */
 static bool range_within(struct bpf_reg_state *old,
 			 struct bpf_reg_state *cur)
@@ -6004,7 +6143,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	int i, j, subprog_start, subprog_end = 0, len, subprog;
 	struct bpf_insn *insn;
 	void *old_bpf_func;
-	int err = -ENOMEM;
+	int err;
 
 	if (env->subprog_cnt <= 1)
 		return 0;
@@ -6035,6 +6174,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
+	err = bpf_prog_alloc_jited_linfo(prog);
+	if (err)
+		goto out_undo_insn;
+
+	err = -ENOMEM;
 	func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
 	if (!func)
 		goto out_undo_insn;
@@ -6065,6 +6209,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->aux->linfo = prog->aux->linfo;
+		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -6138,6 +6286,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt;
+	bpf_prog_free_unused_jited_linfo(prog);
 	return 0;
 out_free:
 	for (i = 0; i < env->subprog_cnt; i++)
@@ -6154,6 +6303,7 @@ out_undo_insn:
 		insn->off = 0;
 		insn->imm = env->insn_aux_data[i].call_imm;
 	}
+	bpf_prog_free_jited_linfo(prog);
 	return err;
 }
 
@@ -6526,7 +6676,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (ret < 0)
 		goto skip_full_check;
 
-	ret = check_btf_func(env->prog, env, attr, uattr);
+	ret = check_btf_info(env, attr, uattr);
 	if (ret < 0)
 		goto skip_full_check;
 
-- 
cgit 


From e80c1a9d5f514ce5134c6c4263a11607341466c9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 4 Dec 2018 19:00:01 +0900
Subject: printk: fix printk_time race.

Since printk_time can be toggled via /sys/module/printk/parameters/time ,
it is not safe to assume that output length does not change across
multiple msg_print_text() calls. If we hit this race, we can observe
failures such as SYSLOG_ACTION_READ_ALL writes more bytes than userspace
has supplied, SYSLOG_ACTION_SIZE_UNREAD returns -EFAULT when succeeded,
SYSLOG_ACTION_READ reads garbage memory or even triggers an kernel oops
at _copy_to_user() due to integer overflow.

To close this race, get a snapshot value of printk_time and pass it to
SYSLOG_ACTION_READ, SYSLOG_ACTION_READ_ALL, SYSLOG_ACTION_SIZE_UNREAD and
kmsg_dump_get_buffer().

Link: http://lkml.kernel.org/r/555af37c-b9e0-f940-cb73-a78eba2d4944@i-love.sakura.ne.jp
To: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk.c | 70 ++++++++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1d88212a5d2..c7d217764db3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -404,6 +404,7 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait);
 static u64 syslog_seq;
 static u32 syslog_idx;
 static size_t syslog_partial;
+static bool syslog_time;
 
 /* index and sequence number of the first record stored in the buffer */
 static u64 log_first_seq;
@@ -1229,12 +1230,7 @@ module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
 
 static size_t print_time(u64 ts, char *buf)
 {
-	unsigned long rem_nsec;
-
-	if (!printk_time)
-		return 0;
-
-	rem_nsec = do_div(ts, 1000000000);
+	unsigned long rem_nsec = do_div(ts, 1000000000);
 
 	if (!buf)
 		return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
@@ -1243,7 +1239,8 @@ static size_t print_time(u64 ts, char *buf)
 		       (unsigned long)ts, rem_nsec / 1000);
 }
 
-static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
+static size_t print_prefix(const struct printk_log *msg, bool syslog,
+			   bool time, char *buf)
 {
 	size_t len = 0;
 	unsigned int prefix = (msg->facility << 3) | msg->level;
@@ -1262,11 +1259,13 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
 		}
 	}
 
-	len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+	if (time)
+		len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
 	return len;
 }
 
-static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size)
+static size_t msg_print_text(const struct printk_log *msg, bool syslog,
+			     bool time, char *buf, size_t size)
 {
 	const char *text = log_text(msg);
 	size_t text_size = msg->text_len;
@@ -1285,17 +1284,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *bu
 		}
 
 		if (buf) {
-			if (print_prefix(msg, syslog, NULL) +
+			if (print_prefix(msg, syslog, time, NULL) +
 			    text_len + 1 >= size - len)
 				break;
 
-			len += print_prefix(msg, syslog, buf + len);
+			len += print_prefix(msg, syslog, time, buf + len);
 			memcpy(buf + len, text, text_len);
 			len += text_len;
 			buf[len++] = '\n';
 		} else {
 			/* SYSLOG_ACTION_* buffer size only calculation */
-			len += print_prefix(msg, syslog, NULL);
+			len += print_prefix(msg, syslog, time, NULL);
 			len += text_len;
 			len++;
 		}
@@ -1332,9 +1331,17 @@ static int syslog_print(char __user *buf, int size)
 			break;
 		}
 
+		/*
+		 * To keep reading/counting partial line consistent,
+		 * use printk_time value as of the beginning of a line.
+		 */
+		if (!syslog_partial)
+			syslog_time = printk_time;
+
 		skip = syslog_partial;
 		msg = log_from_idx(syslog_idx);
-		n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX);
+		n = msg_print_text(msg, true, syslog_time, text,
+				   LOG_LINE_MAX + PREFIX_MAX);
 		if (n - syslog_partial <= size) {
 			/* message fits into buffer, move forward */
 			syslog_idx = log_next(syslog_idx);
@@ -1374,11 +1381,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	u64 next_seq;
 	u64 seq;
 	u32 idx;
+	bool time;
 
 	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
 	if (!text)
 		return -ENOMEM;
 
+	time = printk_time;
 	logbuf_lock_irq();
 	/*
 	 * Find first record that fits, including all following records,
@@ -1389,7 +1398,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	while (seq < log_next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		len += msg_print_text(msg, true, NULL, 0);
+		len += msg_print_text(msg, true, time, NULL, 0);
 		idx = log_next(idx);
 		seq++;
 	}
@@ -1400,7 +1409,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	while (len > size && seq < log_next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		len -= msg_print_text(msg, true, NULL, 0);
+		len -= msg_print_text(msg, true, time, NULL, 0);
 		idx = log_next(idx);
 		seq++;
 	}
@@ -1411,14 +1420,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	len = 0;
 	while (len >= 0 && seq < next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
-		int textlen;
+		int textlen = msg_print_text(msg, true, time, text,
+					     LOG_LINE_MAX + PREFIX_MAX);
 
-		textlen = msg_print_text(msg, true, text,
-					 LOG_LINE_MAX + PREFIX_MAX);
-		if (textlen < 0) {
-			len = textlen;
-			break;
-		}
 		idx = log_next(idx);
 		seq++;
 
@@ -1542,11 +1546,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
 		} else {
 			u64 seq = syslog_seq;
 			u32 idx = syslog_idx;
+			bool time = syslog_partial ? syslog_time : printk_time;
 
 			while (seq < log_next_seq) {
 				struct printk_log *msg = log_from_idx(idx);
 
-				error += msg_print_text(msg, true, NULL, 0);
+				error += msg_print_text(msg, true, time, NULL,
+							0);
+				time = printk_time;
 				idx = log_next(idx);
 				seq++;
 			}
@@ -2004,6 +2011,7 @@ EXPORT_SYMBOL(printk);
 
 #define LOG_LINE_MAX		0
 #define PREFIX_MAX		0
+#define printk_time		false
 
 static u64 syslog_seq;
 static u32 syslog_idx;
@@ -2027,8 +2035,8 @@ static void console_lock_spinning_enable(void) { }
 static int console_lock_spinning_disable_and_check(void) { return 0; }
 static void call_console_drivers(const char *ext_text, size_t ext_len,
 				 const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg,
-			     bool syslog, char *buf, size_t size) { return 0; }
+static size_t msg_print_text(const struct printk_log *msg, bool syslog,
+			     bool time, char *buf, size_t size) { return 0; }
 static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
@@ -2386,8 +2394,7 @@ skip:
 
 		len += msg_print_text(msg,
 				console_msg_format & MSG_FORMAT_SYSLOG,
-				text + len,
-				sizeof(text) - len);
+				printk_time, text + len, sizeof(text) - len);
 		if (nr_ext_console_drivers) {
 			ext_len = msg_print_ext_header(ext_text,
 						sizeof(ext_text),
@@ -3111,7 +3118,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 
 	msg = log_from_idx(dumper->cur_idx);
-	l = msg_print_text(msg, syslog, line, size);
+	l = msg_print_text(msg, syslog, printk_time, line, size);
 
 	dumper->cur_idx = log_next(dumper->cur_idx);
 	dumper->cur_seq++;
@@ -3182,6 +3189,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	u32 next_idx;
 	size_t l = 0;
 	bool ret = false;
+	bool time = printk_time;
 
 	if (!dumper->active)
 		goto out;
@@ -3205,7 +3213,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	while (seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l += msg_print_text(msg, true, NULL, 0);
+		l += msg_print_text(msg, true, time, NULL, 0);
 		idx = log_next(idx);
 		seq++;
 	}
@@ -3216,7 +3224,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	while (l > size && seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l -= msg_print_text(msg, true, NULL, 0);
+		l -= msg_print_text(msg, true, time, NULL, 0);
 		idx = log_next(idx);
 		seq++;
 	}
@@ -3229,7 +3237,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	while (seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l += msg_print_text(msg, syslog, buf + l, size - l);
+		l += msg_print_text(msg, syslog, time, buf + l, size - l);
 		idx = log_next(idx);
 		seq++;
 	}
-- 
cgit 


From 7e1413edd6194a9807aa5f3ac0378b9b4b9da879 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 4 Dec 2018 13:35:45 -0500
Subject: tracing: Consolidate trace_add/remove_event_call back to the nolock
 functions

The trace_add/remove_event_call_nolock() functions were added to allow
the tace_add/remove_event_call() code be called when the event_mutex
lock was already taken. Now that all callers are done within the
event_mutex, there's no reason to have two different interfaces.

Remove the current wrapper trace_add/remove_event_call()s and rename the
_nolock versions back to the original names.

Link: http://lkml.kernel.org/r/154140866955.17322.2081425494660638846.stgit@devbox

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h     |  2 --
 kernel/trace/trace_events.c      | 30 ++++--------------------------
 kernel/trace/trace_events_hist.c |  6 +++---
 kernel/trace/trace_kprobe.c      |  4 ++--
 kernel/trace/trace_uprobe.c      |  4 ++--
 5 files changed, 11 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3aa05593a53f..4130a5497d40 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -529,8 +529,6 @@ extern int trace_event_raw_init(struct trace_event_call *call);
 extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      const char *name, int offset, int size,
 			      int is_signed, int filter_type);
-extern int trace_add_event_call_nolock(struct trace_event_call *call);
-extern int trace_remove_event_call_nolock(struct trace_event_call *call);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
 extern int trace_event_get_offsets(struct trace_event_call *call);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a3b157f689ee..bd0162c0467c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2305,7 +2305,8 @@ __trace_early_add_new_event(struct trace_event_call *call,
 struct ftrace_module_file_ops;
 static void __add_event_to_tracers(struct trace_event_call *call);
 
-int trace_add_event_call_nolock(struct trace_event_call *call)
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
 	lockdep_assert_held(&event_mutex);
@@ -2320,17 +2321,6 @@ int trace_add_event_call_nolock(struct trace_event_call *call)
 	return ret;
 }
 
-/* Add an additional event_call dynamically */
-int trace_add_event_call(struct trace_event_call *call)
-{
-	int ret;
-
-	mutex_lock(&event_mutex);
-	ret = trace_add_event_call_nolock(call);
-	mutex_unlock(&event_mutex);
-	return ret;
-}
-
 /*
  * Must be called under locking of trace_types_lock, event_mutex and
  * trace_event_sem.
@@ -2376,8 +2366,8 @@ static int probe_remove_event_call(struct trace_event_call *call)
 	return 0;
 }
 
-/* no event_mutex version */
-int trace_remove_event_call_nolock(struct trace_event_call *call)
+/* Remove an event_call */
+int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
@@ -2392,18 +2382,6 @@ int trace_remove_event_call_nolock(struct trace_event_call *call)
 	return ret;
 }
 
-/* Remove an event_call */
-int trace_remove_event_call(struct trace_event_call *call)
-{
-	int ret;
-
-	mutex_lock(&event_mutex);
-	ret = trace_remove_event_call_nolock(call);
-	mutex_unlock(&event_mutex);
-
-	return ret;
-}
-
 #define for_each_event(event, start, end)			\
 	for (event = start;					\
 	     (unsigned long)event < (unsigned long)end;		\
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 21e4954375a1..82e72c48a5a9 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -960,7 +960,7 @@ static int register_synth_event(struct synth_event *event)
 	call->data = event;
 	call->tp = event->tp;
 
-	ret = trace_add_event_call_nolock(call);
+	ret = trace_add_event_call(call);
 	if (ret) {
 		pr_warn("Failed to register synthetic event: %s\n",
 			trace_event_name(call));
@@ -969,7 +969,7 @@ static int register_synth_event(struct synth_event *event)
 
 	ret = set_synth_event_print_fmt(call);
 	if (ret < 0) {
-		trace_remove_event_call_nolock(call);
+		trace_remove_event_call(call);
 		goto err;
 	}
  out:
@@ -984,7 +984,7 @@ static int unregister_synth_event(struct synth_event *event)
 	struct trace_event_call *call = &event->call;
 	int ret;
 
-	ret = trace_remove_event_call_nolock(call);
+	ret = trace_remove_event_call(call);
 
 	return ret;
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bdf8c2ad5152..0e0f7b8024fb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1353,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	ret = trace_add_event_call_nolock(call);
+	ret = trace_add_event_call(call);
 	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n",
 			trace_event_name(call));
@@ -1368,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
 	int ret;
 
 	/* tp->event is unregistered in trace_remove_event_call() */
-	ret = trace_remove_event_call_nolock(&tk->tp.call);
+	ret = trace_remove_event_call(&tk->tp.call);
 	if (!ret)
 		kfree(tk->tp.call.print_fmt);
 	return ret;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4a7b21c891f3..e335576b9411 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1320,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 		return -ENODEV;
 	}
 
-	ret = trace_add_event_call_nolock(call);
+	ret = trace_add_event_call(call);
 
 	if (ret) {
 		pr_info("Failed to register uprobe event: %s\n",
@@ -1337,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
 	int ret;
 
 	/* tu->event is unregistered in trace_remove_event_call() */
-	ret = trace_remove_event_call_nolock(&tu->tp.call);
+	ret = trace_remove_event_call(&tu->tp.call);
 	if (ret)
 		return ret;
 	kfree(tu->tp.call.print_fmt);
-- 
cgit 


From 1ce25e9f6fff7633955e8ce776e5e9d7a780d34b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 5 Nov 2018 18:04:57 +0900
Subject: tracing: Add generic event-name based remove event method

Add a generic method to remove event from dynamic event
list. This is same as other system under ftrace. You
just need to pass the event name with '!', e.g.

  # echo p:new_grp/new_event _do_fork > dynamic_events

This creates an event, and

  # echo '!p:new_grp/new_event _do_fork' > dynamic_events

Or,

  # echo '!p:new_grp/new_event' > dynamic_events

will remove new_grp/new_event event.

Note that this doesn't check the event prefix (e.g. "p:")
strictly, because the "group/event" name must be unique.

Link: http://lkml.kernel.org/r/154140869774.17322.8887303560398645347.stgit@devbox

Reviewed-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_dynevent.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index f17a887abb66..dd1f43588d70 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -37,10 +37,17 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
 	char *system = NULL, *event, *p;
 	int ret = -ENOENT;
 
-	if (argv[0][1] != ':')
-		return -EINVAL;
+	if (argv[0][0] == '-') {
+		if (argv[0][1] != ':')
+			return -EINVAL;
+		event = &argv[0][2];
+	} else {
+		event = strchr(argv[0], ':');
+		if (!event)
+			return -EINVAL;
+		event++;
+	}
 
-	event = &argv[0][2];
 	p = strchr(event, '/');
 	if (p) {
 		system = event;
@@ -69,7 +76,7 @@ static int create_dyn_event(int argc, char **argv)
 	struct dyn_event_operations *ops;
 	int ret;
 
-	if (argv[0][0] == '-')
+	if (argv[0][0] == '-' || argv[0][0] == '!')
 		return dyn_event_release(argc, argv, NULL);
 
 	mutex_lock(&dyn_event_ops_mutex);
-- 
cgit 


From a0572f687fb3c46e15554f4789797a077cc393b4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 5 Dec 2018 12:48:53 -0500
Subject: ftrace: Allow ftrace_replace_code() to be schedulable

The function ftrace_replace_code() is the ftrace engine that does the
work to modify all the nops into the calls to the function callback in
all the functions being traced.

The generic version which is normally called from stop machine, but an
architecture can implement a non stop machine version and still use the
generic ftrace_replace_code(). When an architecture does this,
ftrace_replace_code() may be called from a schedulable context, where
it can allow the code to be preemptible, and schedule out.

In order to allow an architecture to make ftrace_replace_code()
schedulable, a new command flag is added called:

 FTRACE_MAY_SLEEP

Which can be or'd to the command that is passed to
ftrace_modify_all_code() that calls ftrace_replace_code() and will have
it call cond_resched() in the loop that modifies the nops into the
calls to the ftrace trampolines.

Link: http://lkml.kernel.org/r/20181204192903.8193-1-anders.roxell@linaro.org
Link: http://lkml.kernel.org/r/20181205183303.828422192@goodmis.org

Reported-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  1 +
 kernel/trace/ftrace.c  | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 98e141c71ad0..13485a19e964 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -389,6 +389,7 @@ enum {
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
 	FTRACE_START_FUNC_RET		= (1 << 3),
 	FTRACE_STOP_FUNC_RET		= (1 << 4),
+	FTRACE_MAY_SLEEP		= (1 << 5),
 };
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8ef9fc226037..ab3e8b995e12 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -77,6 +77,11 @@
 #define ASSIGN_OPS_HASH(opsname, val)
 #endif
 
+enum {
+	FTRACE_MODIFY_ENABLE_FL		= (1 << 0),
+	FTRACE_MODIFY_MAY_SLEEP_FL	= (1 << 1),
+};
+
 struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
 	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
@@ -2389,10 +2394,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	return -1; /* unknow ftrace bug */
 }
 
-void __weak ftrace_replace_code(int enable)
+void __weak ftrace_replace_code(int mod_flags)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+	int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
 	int failed;
 
 	if (unlikely(ftrace_disabled))
@@ -2409,6 +2416,8 @@ void __weak ftrace_replace_code(int enable)
 			/* Stop processing */
 			return;
 		}
+		if (schedulable)
+			cond_resched();
 	} while_for_each_ftrace_rec();
 }
 
@@ -2522,8 +2531,12 @@ int __weak ftrace_arch_code_modify_post_process(void)
 void ftrace_modify_all_code(int command)
 {
 	int update = command & FTRACE_UPDATE_TRACE_FUNC;
+	int mod_flags = 0;
 	int err = 0;
 
+	if (command & FTRACE_MAY_SLEEP)
+		mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL;
+
 	/*
 	 * If the ftrace_caller calls a ftrace_ops func directly,
 	 * we need to make sure that it only traces functions it
@@ -2541,9 +2554,9 @@ void ftrace_modify_all_code(int command)
 	}
 
 	if (command & FTRACE_UPDATE_CALLS)
-		ftrace_replace_code(1);
+		ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL);
 	else if (command & FTRACE_DISABLE_CALLS)
-		ftrace_replace_code(0);
+		ftrace_replace_code(mod_flags);
 
 	if (update && ftrace_trace_function != ftrace_ops_list_func) {
 		function_trace_op = set_function_trace_op;
-- 
cgit 


From 45fe439bc3699851802062c05acf4cbac4d672bc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 7 Dec 2018 12:25:52 -0500
Subject: fgraph: Add comment to describe ftrace_graph_get_ret_stack

The ret_stack should not be accessed directly via the curr_ret_stack
variable on the task_struct. This is because the ret_stack is going to be
converted into a series of longs and not an array of ret_stack structures.
The way that a ret_stack should be retrieved is via the
ftrace_graph_get_ret_stack structure, but it needs to be documented on how
to use it.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a3704ec8b599..d4f04f0ca646 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	return ret;
 }
 
+/**
+ * ftrace_graph_get_ret_stack - return the entry of the shadow stack
+ * @task: The task to read the shadow stack from
+ * @idx: Index down the shadow stack
+ *
+ * Return the ret_struct on the shadow stack of the @task at the
+ * call graph at @idx starting with zero. If @idx is zero, it
+ * will return the last saved ret_stack entry. If it is greater than
+ * zero, it will return the corresponding ret_stack for the depth
+ * of saved return addresses.
+ */
 struct ftrace_ret_stack *
 ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
 {
-- 
cgit 


From e434b8cdf788568ba65a0a0fd9f3cb41f3ca1803 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 7 Dec 2018 12:16:18 -0500
Subject: bpf: relax verifier restriction on BPF_MOV | BPF_ALU

Currently, the destination register is marked as unknown for 32-bit
sub-register move (BPF_MOV | BPF_ALU) whenever the source register type is
SCALAR_VALUE.

This is too conservative that some valid cases will be rejected.
Especially, this may turn a constant scalar value into unknown value that
could break some assumptions of verifier.

For example, test_l4lb_noinline.c has the following C code:

    struct real_definition *dst

1:  if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
2:    return TC_ACT_SHOT;
3:
4:  if (dst->flags & F_IPV6) {

get_packet_dst is responsible for initializing "dst" into valid pointer and
return true (1), otherwise return false (0). The compiled instruction
sequence using alu32 will be:

  412: (54) (u32) r7 &= (u32) 1
  413: (bc) (u32) r0 = (u32) r7
  414: (95) exit

insn 413, a BPF_MOV | BPF_ALU, however will turn r0 into unknown value even
r7 contains SCALAR_VALUE 1.

This causes trouble when verifier is walking the code path that hasn't
initialized "dst" inside get_packet_dst, for which case 0 is returned and
we would then expect verifier concluding line 1 in the above C code pass
the "if" check, therefore would skip fall through path starting at line 4.
Now, because r0 returned from callee has became unknown value, so verifier
won't skip analyzing path starting at line 4 and "dst->flags" requires
dereferencing the pointer "dst" which actually hasn't be initialized for
this path.

This patch relaxed the code marking sub-register move destination. For a
SCALAR_VALUE, it is safe to just copy the value from source then truncate
it into 32-bit.

A unit test also included to demonstrate this issue. This test will fail
before this patch.

This relaxation could let verifier skipping more paths for conditional
comparison against immediate. It also let verifier recording a more
accurate/strict value for one register at one state, if this state end up
with going through exit without rejection and it is used for state
comparison later, then it is possible an inaccurate/permissive value is
better. So the real impact on verifier processed insn number is complex.
But in all, without this fix, valid program could be rejected.

>From real benchmarking on kernel selftests and Cilium bpf tests, there is
no impact on processed instruction number when tests ares compiled with
default compilation options. There is slightly improvements when they are
compiled with -mattr=+alu32 after this patch.

Also, test_xdp_noinline/-mattr=+alu32 now passed verification. It is
rejected before this fix.

Insn processed before/after this patch:

                        default     -mattr=+alu32

Kernel selftest

===
test_xdp.o              371/371      369/369
test_l4lb.o             6345/6345    5623/5623
test_xdp_noinline.o     2971/2971    rejected/2727
test_tcp_estates.o      429/429      430/430

Cilium bpf
===
bpf_lb-DLB_L3.o:        2085/2085     1685/1687
bpf_lb-DLB_L4.o:        2287/2287     1986/1982
bpf_lb-DUNKNOWN.o:      690/690       622/622
bpf_lxc.o:              95033/95033   N/A
bpf_netdev.o:           7245/7245     N/A
bpf_overlay.o:          2898/2898     3085/2947

NOTE:
  - bpf_lxc.o and bpf_netdev.o compiled by -mattr=+alu32 are rejected by
    verifier due to another issue inside verifier on supporting alu32
    binary.
  - Each cilium bpf program could generate several processed insn number,
    above number is sum of them.

v1->v2:
 - Restrict the change on SCALAR_VALUE.
 - Update benchmark numbers on Cilium bpf tests.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       | 16 ++++++++++++----
 tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9d25506bd55a..2e70b813a1a7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3583,12 +3583,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return err;
 
 		if (BPF_SRC(insn->code) == BPF_X) {
+			struct bpf_reg_state *src_reg = regs + insn->src_reg;
+			struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+
 			if (BPF_CLASS(insn->code) == BPF_ALU64) {
 				/* case: R1 = R2
 				 * copy register state to dest reg
 				 */
-				regs[insn->dst_reg] = regs[insn->src_reg];
-				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
+				*dst_reg = *src_reg;
+				dst_reg->live |= REG_LIVE_WRITTEN;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
@@ -3596,9 +3599,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 						"R%d partial copy of pointer\n",
 						insn->src_reg);
 					return -EACCES;
+				} else if (src_reg->type == SCALAR_VALUE) {
+					*dst_reg = *src_reg;
+					dst_reg->live |= REG_LIVE_WRITTEN;
+				} else {
+					mark_reg_unknown(env, regs,
+							 insn->dst_reg);
 				}
-				mark_reg_unknown(env, regs, insn->dst_reg);
-				coerce_reg_to_size(&regs[insn->dst_reg], 4);
+				coerce_reg_to_size(dst_reg, 4);
 			}
 		} else {
 			/* case: R = imm
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 36ce58b4933e..957e4711c46c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -2959,6 +2959,19 @@ static struct bpf_test tests[] = {
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
 	},
+	{
+		"alu32: mov u32 const",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_7, 0),
+			BPF_ALU32_IMM(BPF_AND, BPF_REG_7, 1),
+			BPF_MOV32_REG(BPF_REG_0, BPF_REG_7),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 0,
+	},
 	{
 		"unpriv: partial copy of pointer",
 		.insns = {
-- 
cgit 


From 7a5725ddc6e18aab61f647c0996d3b7eb03ff5cb Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 10 Dec 2018 11:17:50 -0800
Subject: bpf: clean up bpf_prog_get_info_by_fd()

info.nr_jited_ksyms and info.nr_jited_func_lens cannot be 0 in these two
statements, so we don't need to check them.

Signed-off-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 19c88cff7880..a99a23bf5910 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2209,7 +2209,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	ulen = info.nr_jited_ksyms;
 	info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
-	if (info.nr_jited_ksyms && ulen) {
+	if (ulen) {
 		if (bpf_dump_raw_ok()) {
 			unsigned long ksym_addr;
 			u64 __user *user_ksyms;
@@ -2240,7 +2240,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	ulen = info.nr_jited_func_lens;
 	info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
-	if (info.nr_jited_func_lens && ulen) {
+	if (ulen) {
 		if (bpf_dump_raw_ok()) {
 			u32 __user *user_lens;
 			u32 func_len, i;
-- 
cgit 


From 11d8b82d2222cade12caad2c125f23023777dcbc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 10 Dec 2018 14:14:08 -0800
Subject: bpf: rename *_info_cnt to nr_*_info in bpf_prog_info

In uapi bpf.h, currently we have the following fields in
the struct bpf_prog_info:
	__u32 func_info_cnt;
	__u32 line_info_cnt;
	__u32 jited_line_info_cnt;
The above field names "func_info_cnt" and "line_info_cnt"
also appear in union bpf_attr for program loading.

The original intention is to keep the names the same
between bpf_prog_info and bpf_attr
so it will imply what we returned to user space will be
the same as what the user space passed to the kernel.

Such a naming convention in bpf_prog_info is not consistent
with other fields like:
        __u32 nr_jited_ksyms;
        __u32 nr_jited_func_lens;

This patch made this adjustment so in bpf_prog_info
newly introduced *_info_cnt becomes nr_*_info.

Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  6 +++---
 kernel/bpf/syscall.c     | 38 +++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1bee1135866a..f943ed803309 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2696,11 +2696,11 @@ struct bpf_prog_info {
 	__u32 btf_id;
 	__u32 func_info_rec_size;
 	__aligned_u64 func_info;
-	__u32 func_info_cnt;
-	__u32 line_info_cnt;
+	__u32 nr_func_info;
+	__u32 nr_line_info;
 	__aligned_u64 line_info;
 	__aligned_u64 jited_line_info;
-	__u32 jited_line_info_cnt;
+	__u32 nr_jited_line_info;
 	__u32 line_info_rec_size;
 	__u32 jited_line_info_rec_size;
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a99a23bf5910..5745c7837621 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2055,15 +2055,15 @@ static int set_info_rec_size(struct bpf_prog_info *info)
 	 * _rec_size back to the info.
 	 */
 
-	if ((info->func_info_cnt || info->func_info_rec_size) &&
+	if ((info->nr_func_info || info->func_info_rec_size) &&
 	    info->func_info_rec_size != sizeof(struct bpf_func_info))
 		return -EINVAL;
 
-	if ((info->line_info_cnt || info->line_info_rec_size) &&
+	if ((info->nr_line_info || info->line_info_rec_size) &&
 	    info->line_info_rec_size != sizeof(struct bpf_line_info))
 		return -EINVAL;
 
-	if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) &&
+	if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
 	    info->jited_line_info_rec_size != sizeof(__u64))
 		return -EINVAL;
 
@@ -2125,9 +2125,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
 		info.nr_jited_func_lens = 0;
-		info.func_info_cnt = 0;
-		info.line_info_cnt = 0;
-		info.jited_line_info_cnt = 0;
+		info.nr_func_info = 0;
+		info.nr_line_info = 0;
+		info.nr_jited_line_info = 0;
 		goto done;
 	}
 
@@ -2268,14 +2268,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (prog->aux->btf)
 		info.btf_id = btf_id(prog->aux->btf);
 
-	ulen = info.func_info_cnt;
-	info.func_info_cnt = prog->aux->func_info_cnt;
-	if (info.func_info_cnt && ulen) {
+	ulen = info.nr_func_info;
+	info.nr_func_info = prog->aux->func_info_cnt;
+	if (info.nr_func_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			char __user *user_finfo;
 
 			user_finfo = u64_to_user_ptr(info.func_info);
-			ulen = min_t(u32, info.func_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_func_info, ulen);
 			if (copy_to_user(user_finfo, prog->aux->func_info,
 					 info.func_info_rec_size * ulen))
 				return -EFAULT;
@@ -2284,14 +2284,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
-	ulen = info.line_info_cnt;
-	info.line_info_cnt = prog->aux->nr_linfo;
-	if (info.line_info_cnt && ulen) {
+	ulen = info.nr_line_info;
+	info.nr_line_info = prog->aux->nr_linfo;
+	if (info.nr_line_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			__u8 __user *user_linfo;
 
 			user_linfo = u64_to_user_ptr(info.line_info);
-			ulen = min_t(u32, info.line_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_line_info, ulen);
 			if (copy_to_user(user_linfo, prog->aux->linfo,
 					 info.line_info_rec_size * ulen))
 				return -EFAULT;
@@ -2300,18 +2300,18 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
-	ulen = info.jited_line_info_cnt;
+	ulen = info.nr_jited_line_info;
 	if (prog->aux->jited_linfo)
-		info.jited_line_info_cnt = prog->aux->nr_linfo;
+		info.nr_jited_line_info = prog->aux->nr_linfo;
 	else
-		info.jited_line_info_cnt = 0;
-	if (info.jited_line_info_cnt && ulen) {
+		info.nr_jited_line_info = 0;
+	if (info.nr_jited_line_info && ulen) {
 		if (bpf_dump_raw_ok()) {
 			__u64 __user *user_linfo;
 			u32 i;
 
 			user_linfo = u64_to_user_ptr(info.jited_line_info);
-			ulen = min_t(u32, info.jited_line_info_cnt, ulen);
+			ulen = min_t(u32, info.nr_jited_line_info, ulen);
 			for (i = 0; i < ulen; i++) {
 				if (put_user((__u64)(long)prog->aux->jited_linfo[i],
 					     &user_linfo[i]))
-- 
cgit 


From 108c35a908d484df094f46a1e9d961d732737013 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 3 Dec 2018 11:29:29 +0100
Subject: sched/cpufreq: Add the SPDX tags

The SPDX tags are not present in cpufreq.c and cpufreq_schedutil.c.

Add them and remove the license descriptions

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq.c           | 5 +----
 kernel/sched/cpufreq_schedutil.c | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 5e54cbcae673..22bd8980f32f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Scheduler code and data structures related to cpufreq.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 #include "sched.h"
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3fffad3bc8a8..626ddd4ffa43 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * CPUFreq governor based on scheduler-provided CPU utilization data.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-- 
cgit 


From 9f191555ba4ba8fc82e589670e46a7f79b72a157 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:28 +0000
Subject: dma-debug: Expose nr_total_entries in debugfs

Expose nr_total_entries in debugfs, so that {num,min}_free_entries
become even more meaningful to users interested in current/maximum
utilisation. This becomes even more relevant once nr_total_entries
may change at runtime beyond just the existing AMD GART debug code.

Suggested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/DMA-API.txt | 3 +++
 kernel/dma/debug.c        | 7 +++++++
 2 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index ac66ae2509a9..6bdb095393b0 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -723,6 +723,9 @@ dma-api/min_free_entries	This read-only file can be read to get the
 dma-api/num_free_entries	The current number of free dma_debug_entries
 				in the allocator.
 
+dma-api/nr_total_entries	The total number of dma_debug_entries in the
+				allocator, both free and used.
+
 dma-api/driver-filter		You can write a name of a driver into this file
 				to limit the debug output to requests from that
 				particular driver. Write an empty string to
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 231ca4628062..f6a141eb9438 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -142,6 +142,7 @@ static struct dentry *show_all_errors_dent  __read_mostly;
 static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *nr_total_entries_dent __read_mostly;
 static struct dentry *filter_dent           __read_mostly;
 
 /* per-driver filter related state */
@@ -926,6 +927,12 @@ static int dma_debug_fs_init(void)
 	if (!min_free_entries_dent)
 		goto out_err;
 
+	nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444,
+			dma_debug_dent,
+			&nr_total_entries);
+	if (!nr_total_entries_dent)
+		goto out_err;
+
 	filter_dent = debugfs_create_file("driver_filter", 0644,
 					  dma_debug_dent, NULL, &filter_fops);
 	if (!filter_dent)
-- 
cgit 


From f737b095c60c635db260e02fdb9f0efb9f3360c4 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:27 +0000
Subject: dma-debug: Use pr_fmt()

Use pr_fmt() to generate the "DMA-API: " prefix consistently. This
results in it being added to a couple of pr_*() messages which were
missing it before, and for the err_printk() calls moves it to the actual
start of the message instead of somewhere in the middle.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 74 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f6a141eb9438..29486eb9d1dc 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -17,6 +17,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#define pr_fmt(fmt)	"DMA-API: " fmt
+
 #include <linux/sched/task_stack.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -235,7 +237,7 @@ static bool driver_filter(struct device *dev)
 		error_count += 1;					\
 		if (driver_filter(dev) &&				\
 		    (show_all_errors || show_num_errors > 0)) {		\
-			WARN(1, "%s %s: " format,			\
+			WARN(1, pr_fmt("%s %s: ") format,		\
 			     dev ? dev_driver_string(dev) : "NULL",	\
 			     dev ? dev_name(dev) : "NULL", ## arg);	\
 			dump_entry_trace(entry);			\
@@ -520,7 +522,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
 	 * prematurely.
 	 */
 	WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
-		  "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n",
+		  pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
 		  ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
 }
 
@@ -615,7 +617,7 @@ void debug_dma_assert_idle(struct page *page)
 
 	cln = to_cacheline_number(entry);
 	err_printk(entry->dev, entry,
-		   "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n",
+		   "cpu touching an active dma mapped cacheline [cln=%pa]\n",
 		   &cln);
 }
 
@@ -635,7 +637,7 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 
 	rc = active_cacheline_insert(entry);
 	if (rc == -ENOMEM) {
-		pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n");
+		pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
 		global_disable = true;
 	}
 
@@ -674,7 +676,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	if (list_empty(&free_entries)) {
 		global_disable = true;
 		spin_unlock_irqrestore(&free_entries_lock, flags);
-		pr_err("DMA-API: debugging out of memory - disabling\n");
+		pr_err("debugging out of memory - disabling\n");
 		return NULL;
 	}
 
@@ -778,7 +780,7 @@ static int prealloc_memory(u32 num_entries)
 	num_free_entries = num_entries;
 	min_free_entries = num_entries;
 
-	pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
+	pr_info("preallocated %d debug entries\n", num_entries);
 
 	return 0;
 
@@ -851,7 +853,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 		 * switched off.
 		 */
 		if (current_driver_name[0])
-			pr_info("DMA-API: switching off dma-debug driver filter\n");
+			pr_info("switching off dma-debug driver filter\n");
 		current_driver_name[0] = 0;
 		current_driver = NULL;
 		goto out_unlock;
@@ -869,7 +871,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	current_driver_name[i] = 0;
 	current_driver = NULL;
 
-	pr_info("DMA-API: enable driver filter for driver [%s]\n",
+	pr_info("enable driver filter for driver [%s]\n",
 		current_driver_name);
 
 out_unlock:
@@ -888,7 +890,7 @@ static int dma_debug_fs_init(void)
 {
 	dma_debug_dent = debugfs_create_dir("dma-api", NULL);
 	if (!dma_debug_dent) {
-		pr_err("DMA-API: can not create debugfs directory\n");
+		pr_err("can not create debugfs directory\n");
 		return -ENOMEM;
 	}
 
@@ -980,7 +982,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
 		count = device_dma_allocations(dev, &entry);
 		if (count == 0)
 			break;
-		err_printk(dev, entry, "DMA-API: device driver has pending "
+		err_printk(dev, entry, "device driver has pending "
 				"DMA allocations while released from device "
 				"[count=%d]\n"
 				"One of leaked entries details: "
@@ -1030,14 +1032,14 @@ static int dma_debug_init(void)
 	}
 
 	if (dma_debug_fs_init() != 0) {
-		pr_err("DMA-API: error creating debugfs entries - disabling\n");
+		pr_err("error creating debugfs entries - disabling\n");
 		global_disable = true;
 
 		return 0;
 	}
 
 	if (prealloc_memory(nr_prealloc_entries) != 0) {
-		pr_err("DMA-API: debugging out of memory error - disabled\n");
+		pr_err("debugging out of memory error - disabled\n");
 		global_disable = true;
 
 		return 0;
@@ -1047,7 +1049,7 @@ static int dma_debug_init(void)
 
 	dma_debug_initialized = true;
 
-	pr_info("DMA-API: debugging enabled by kernel config\n");
+	pr_info("debugging enabled by kernel config\n");
 	return 0;
 }
 core_initcall(dma_debug_init);
@@ -1058,7 +1060,7 @@ static __init int dma_debug_cmdline(char *str)
 		return -EINVAL;
 
 	if (strncmp(str, "off", 3) == 0) {
-		pr_info("DMA-API: debugging disabled on kernel command line\n");
+		pr_info("debugging disabled on kernel command line\n");
 		global_disable = true;
 	}
 
@@ -1092,11 +1094,11 @@ static void check_unmap(struct dma_debug_entry *ref)
 
 		if (dma_mapping_error(ref->dev, ref->dev_addr)) {
 			err_printk(ref->dev, NULL,
-				   "DMA-API: device driver tries to free an "
+				   "device driver tries to free an "
 				   "invalid DMA memory address\n");
 		} else {
 			err_printk(ref->dev, NULL,
-				   "DMA-API: device driver tries to free DMA "
+				   "device driver tries to free DMA "
 				   "memory it has not allocated [device "
 				   "address=0x%016llx] [size=%llu bytes]\n",
 				   ref->dev_addr, ref->size);
@@ -1105,7 +1107,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 	}
 
 	if (ref->size != entry->size) {
-		err_printk(ref->dev, entry, "DMA-API: device driver frees "
+		err_printk(ref->dev, entry, "device driver frees "
 			   "DMA memory with different size "
 			   "[device address=0x%016llx] [map size=%llu bytes] "
 			   "[unmap size=%llu bytes]\n",
@@ -1113,7 +1115,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 	}
 
 	if (ref->type != entry->type) {
-		err_printk(ref->dev, entry, "DMA-API: device driver frees "
+		err_printk(ref->dev, entry, "device driver frees "
 			   "DMA memory with wrong function "
 			   "[device address=0x%016llx] [size=%llu bytes] "
 			   "[mapped as %s] [unmapped as %s]\n",
@@ -1121,7 +1123,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 			   type2name[entry->type], type2name[ref->type]);
 	} else if ((entry->type == dma_debug_coherent) &&
 		   (phys_addr(ref) != phys_addr(entry))) {
-		err_printk(ref->dev, entry, "DMA-API: device driver frees "
+		err_printk(ref->dev, entry, "device driver frees "
 			   "DMA memory with different CPU address "
 			   "[device address=0x%016llx] [size=%llu bytes] "
 			   "[cpu alloc address=0x%016llx] "
@@ -1133,7 +1135,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 
 	if (ref->sg_call_ents && ref->type == dma_debug_sg &&
 	    ref->sg_call_ents != entry->sg_call_ents) {
-		err_printk(ref->dev, entry, "DMA-API: device driver frees "
+		err_printk(ref->dev, entry, "device driver frees "
 			   "DMA sg list with different entry count "
 			   "[map count=%d] [unmap count=%d]\n",
 			   entry->sg_call_ents, ref->sg_call_ents);
@@ -1144,7 +1146,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 	 * DMA API don't handle this properly, so check for it here
 	 */
 	if (ref->direction != entry->direction) {
-		err_printk(ref->dev, entry, "DMA-API: device driver frees "
+		err_printk(ref->dev, entry, "device driver frees "
 			   "DMA memory with different direction "
 			   "[device address=0x%016llx] [size=%llu bytes] "
 			   "[mapped with %s] [unmapped with %s]\n",
@@ -1160,7 +1162,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 	 */
 	if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
 		err_printk(ref->dev, entry,
-			   "DMA-API: device driver failed to check map error"
+			   "device driver failed to check map error"
 			   "[device address=0x%016llx] [size=%llu bytes] "
 			   "[mapped as %s]",
 			   ref->dev_addr, ref->size,
@@ -1185,7 +1187,7 @@ static void check_for_stack(struct device *dev,
 			return;
 		addr = page_address(page) + offset;
 		if (object_is_on_stack(addr))
-			err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+			err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
 	} else {
 		/* Stack is vmalloced. */
 		int i;
@@ -1195,7 +1197,7 @@ static void check_for_stack(struct device *dev,
 				continue;
 
 			addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
-			err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+			err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
 			break;
 		}
 	}
@@ -1215,7 +1217,7 @@ static void check_for_illegal_area(struct device *dev, void *addr, unsigned long
 {
 	if (overlap(addr, len, _stext, _etext) ||
 	    overlap(addr, len, __start_rodata, __end_rodata))
-		err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
+		err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
 }
 
 static void check_sync(struct device *dev,
@@ -1231,7 +1233,7 @@ static void check_sync(struct device *dev,
 	entry = bucket_find_contain(&bucket, ref, &flags);
 
 	if (!entry) {
-		err_printk(dev, NULL, "DMA-API: device driver tries "
+		err_printk(dev, NULL, "device driver tries "
 				"to sync DMA memory it has not allocated "
 				"[device address=0x%016llx] [size=%llu bytes]\n",
 				(unsigned long long)ref->dev_addr, ref->size);
@@ -1239,7 +1241,7 @@ static void check_sync(struct device *dev,
 	}
 
 	if (ref->size > entry->size) {
-		err_printk(dev, entry, "DMA-API: device driver syncs"
+		err_printk(dev, entry, "device driver syncs"
 				" DMA memory outside allocated range "
 				"[device address=0x%016llx] "
 				"[allocation size=%llu bytes] "
@@ -1252,7 +1254,7 @@ static void check_sync(struct device *dev,
 		goto out;
 
 	if (ref->direction != entry->direction) {
-		err_printk(dev, entry, "DMA-API: device driver syncs "
+		err_printk(dev, entry, "device driver syncs "
 				"DMA memory with different direction "
 				"[device address=0x%016llx] [size=%llu bytes] "
 				"[mapped with %s] [synced with %s]\n",
@@ -1263,7 +1265,7 @@ static void check_sync(struct device *dev,
 
 	if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) &&
 		      !(ref->direction == DMA_TO_DEVICE))
-		err_printk(dev, entry, "DMA-API: device driver syncs "
+		err_printk(dev, entry, "device driver syncs "
 				"device read-only DMA memory for cpu "
 				"[device address=0x%016llx] [size=%llu bytes] "
 				"[mapped with %s] [synced with %s]\n",
@@ -1273,7 +1275,7 @@ static void check_sync(struct device *dev,
 
 	if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) &&
 		       !(ref->direction == DMA_FROM_DEVICE))
-		err_printk(dev, entry, "DMA-API: device driver syncs "
+		err_printk(dev, entry, "device driver syncs "
 				"device write-only DMA memory to device "
 				"[device address=0x%016llx] [size=%llu bytes] "
 				"[mapped with %s] [synced with %s]\n",
@@ -1283,7 +1285,7 @@ static void check_sync(struct device *dev,
 
 	if (ref->sg_call_ents && ref->type == dma_debug_sg &&
 	    ref->sg_call_ents != entry->sg_call_ents) {
-		err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+		err_printk(ref->dev, entry, "device driver syncs "
 			   "DMA sg list with different entry count "
 			   "[map count=%d] [sync count=%d]\n",
 			   entry->sg_call_ents, ref->sg_call_ents);
@@ -1304,7 +1306,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
 	 * whoever generated the list forgot to check them.
 	 */
 	if (sg->length > max_seg)
-		err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+		err_printk(dev, NULL, "mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
 			   sg->length, max_seg);
 	/*
 	 * In some cases this could potentially be the DMA API
@@ -1314,7 +1316,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
 	start = sg_dma_address(sg);
 	end = start + sg_dma_len(sg) - 1;
 	if ((start ^ end) & ~boundary)
-		err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+		err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
 			   start, end, boundary);
 #endif
 }
@@ -1326,11 +1328,11 @@ void debug_dma_map_single(struct device *dev, const void *addr,
 		return;
 
 	if (!virt_addr_valid(addr))
-		err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
+		err_printk(dev, NULL, "device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
 			   addr, len);
 
 	if (is_vmalloc_addr(addr))
-		err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
+		err_printk(dev, NULL, "device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
 			   addr, len);
 }
 EXPORT_SYMBOL(debug_dma_map_single);
@@ -1787,7 +1789,7 @@ static int __init dma_debug_driver_setup(char *str)
 	}
 
 	if (current_driver_name[0])
-		pr_info("DMA-API: enable driver filter for driver [%s]\n",
+		pr_info("enable driver filter for driver [%s]\n",
 			current_driver_name);
 
 
-- 
cgit 


From 2b9d9ac02b9d8d32c515c82bb17401c429f160ab Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:29 +0000
Subject: dma-debug: Dynamically expand the dma_debug_entry pool

Certain drivers such as large multi-queue network adapters can use pools
of mapped DMA buffers larger than the default dma_debug_entry pool of
65536 entries, with the result that merely probing such a device can
cause DMA debug to disable itself during boot unless explicitly given an
appropriate "dma_debug_entries=..." option.

Developers trying to debug some other driver on such a system may not be
immediately aware of this, and at worst it can hide bugs if they fail to
realise that dma-debug has already disabled itself unexpectedly by the
time their code of interest gets to run. Even once they do realise, it
can be a bit of a pain to emprirically determine a suitable number of
preallocated entries to configure, short of massively over-allocating.

There's really no need for such a static limit, though, since we can
quite easily expand the pool at runtime in those rare cases that the
preallocated entries are insufficient, which is arguably the least
surprising and most useful behaviour. To that end, refactor the
prealloc_memory() logic a little bit to generalise it for runtime
reallocations as well.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/DMA-API.txt | 11 +++----
 kernel/dma/debug.c        | 79 ++++++++++++++++++++++++-----------------------
 2 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 6bdb095393b0..0fcb7561af1e 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -717,8 +717,8 @@ dma-api/num_errors		The number in this file shows how many
 dma-api/min_free_entries	This read-only file can be read to get the
 				minimum number of free dma_debug_entries the
 				allocator has ever seen. If this value goes
-				down to zero the code will disable itself
-				because it is not longer reliable.
+				down to zero the code will attempt to increase
+				nr_total_entries to compensate.
 
 dma-api/num_free_entries	The current number of free dma_debug_entries
 				in the allocator.
@@ -745,10 +745,9 @@ driver filter at boot time. The debug code will only print errors for that
 driver afterwards. This filter can be disabled or changed later using debugfs.
 
 When the code disables itself at runtime this is most likely because it ran
-out of dma_debug_entries. These entries are preallocated at boot. The number
-of preallocated entries is defined per architecture. If it is too low for you
-boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
-architectural default.
+out of dma_debug_entries and was unable to allocate more on-demand. 65536
+entries are preallocated at boot - if this is too low for you boot with
+'dma_debug_entries=<your_desired_number>' to overwrite the default.
 
 ::
 
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 29486eb9d1dc..ef7c90b7a346 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -47,6 +47,8 @@
 #ifndef PREALLOC_DMA_DEBUG_ENTRIES
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 #endif
+/* If the pool runs out, add this many new entries at once */
+#define DMA_DEBUG_DYNAMIC_ENTRIES 256
 
 enum {
 	dma_debug_single,
@@ -646,6 +648,34 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	 */
 }
 
+static int dma_debug_create_entries(u32 num_entries, gfp_t gfp)
+{
+	struct dma_debug_entry *entry, *next_entry;
+	int i;
+
+	for (i = 0; i < num_entries; ++i) {
+		entry = kzalloc(sizeof(*entry), gfp);
+		if (!entry)
+			goto out_err;
+
+		list_add_tail(&entry->list, &free_entries);
+	}
+
+	num_free_entries += num_entries;
+	nr_total_entries += num_entries;
+
+	return 0;
+
+out_err:
+
+	list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	return -ENOMEM;
+}
+
 static struct dma_debug_entry *__dma_entry_alloc(void)
 {
 	struct dma_debug_entry *entry;
@@ -672,12 +702,14 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	unsigned long flags;
 
 	spin_lock_irqsave(&free_entries_lock, flags);
-
-	if (list_empty(&free_entries)) {
-		global_disable = true;
-		spin_unlock_irqrestore(&free_entries_lock, flags);
-		pr_err("debugging out of memory - disabling\n");
-		return NULL;
+	if (num_free_entries == 0) {
+		if (dma_debug_create_entries(DMA_DEBUG_DYNAMIC_ENTRIES,
+					     GFP_ATOMIC)) {
+			global_disable = true;
+			spin_unlock_irqrestore(&free_entries_lock, flags);
+			pr_err("debugging out of memory - disabling\n");
+			return NULL;
+		}
 	}
 
 	entry = __dma_entry_alloc();
@@ -764,36 +796,6 @@ int dma_debug_resize_entries(u32 num_entries)
  *   2. Preallocate a given number of dma_debug_entry structs
  */
 
-static int prealloc_memory(u32 num_entries)
-{
-	struct dma_debug_entry *entry, *next_entry;
-	int i;
-
-	for (i = 0; i < num_entries; ++i) {
-		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-		if (!entry)
-			goto out_err;
-
-		list_add_tail(&entry->list, &free_entries);
-	}
-
-	num_free_entries = num_entries;
-	min_free_entries = num_entries;
-
-	pr_info("preallocated %d debug entries\n", num_entries);
-
-	return 0;
-
-out_err:
-
-	list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-
-	return -ENOMEM;
-}
-
 static ssize_t filter_read(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
@@ -1038,14 +1040,15 @@ static int dma_debug_init(void)
 		return 0;
 	}
 
-	if (prealloc_memory(nr_prealloc_entries) != 0) {
+	if (dma_debug_create_entries(nr_prealloc_entries, GFP_KERNEL) != 0) {
 		pr_err("debugging out of memory error - disabled\n");
 		global_disable = true;
 
 		return 0;
 	}
 
-	nr_total_entries = num_free_entries;
+	min_free_entries = num_free_entries;
+	pr_info("preallocated %d debug entries\n", nr_total_entries);
 
 	dma_debug_initialized = true;
 
-- 
cgit 


From ceb51173b2b5bd7af0d99079f6bbacdcfc58fe08 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:30 +0000
Subject: dma-debug: Make leak-like behaviour apparent

Now that we can dynamically allocate DMA debug entries to cope with
drivers maintaining excessively large numbers of live mappings, a driver
which *does* actually have a bug leaking mappings (and is not unloaded)
will no longer trigger the "DMA-API: debugging out of memory - disabling"
message until it gets to actual kernel OOM conditions, which means it
could go unnoticed for a while. To that end, let's inform the user each
time the pool has grown to a multiple of its initial size, which should
make it apparent that they either have a leak or might want to increase
the preallocation size.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/DMA-API.txt |  6 +++++-
 kernel/dma/debug.c        | 13 +++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 0fcb7561af1e..7a7d8a415ce8 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -747,7 +747,11 @@ driver afterwards. This filter can be disabled or changed later using debugfs.
 When the code disables itself at runtime this is most likely because it ran
 out of dma_debug_entries and was unable to allocate more on-demand. 65536
 entries are preallocated at boot - if this is too low for you boot with
-'dma_debug_entries=<your_desired_number>' to overwrite the default.
+'dma_debug_entries=<your_desired_number>' to overwrite the default. The
+code will print to the kernel log each time it has dynamically allocated
+as many entries as were initially preallocated. This is to indicate that a
+larger preallocation size may be appropriate, or if it happens continually
+that a driver may be leaking mappings.
 
 ::
 
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index ef7c90b7a346..912c23f4c177 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -691,6 +691,18 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
 	return entry;
 }
 
+void __dma_entry_alloc_check_leak(void)
+{
+	u32 tmp = nr_total_entries % nr_prealloc_entries;
+
+	/* Shout each time we tick over some multiple of the initial pool */
+	if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
+		pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
+			nr_total_entries,
+			(nr_total_entries / nr_prealloc_entries));
+	}
+}
+
 /* struct dma_entry allocator
  *
  * The next two functions implement the allocator for
@@ -710,6 +722,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 			pr_err("debugging out of memory - disabling\n");
 			return NULL;
 		}
+		__dma_entry_alloc_check_leak();
 	}
 
 	entry = __dma_entry_alloc();
-- 
cgit 


From 0cb0e25e421436a83ee39857923e4213b983e463 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:32 +0000
Subject: dma/debug: Remove dma_debug_resize_entries()

With the only caller now gone, we can clean up this part of dma-debug's
exposed internals and make way to tweak the allocation behaviour.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-debug.h |  7 -------
 kernel/dma/debug.c        | 46 ----------------------------------------------
 2 files changed, 53 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 30213adbb6b9..46e6131a72b6 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -30,8 +30,6 @@ struct bus_type;
 
 extern void dma_debug_add_bus(struct bus_type *bus);
 
-extern int dma_debug_resize_entries(u32 num_entries);
-
 extern void debug_dma_map_single(struct device *dev, const void *addr,
 				 unsigned long len);
 
@@ -101,11 +99,6 @@ static inline void dma_debug_add_bus(struct bus_type *bus)
 {
 }
 
-static inline int dma_debug_resize_entries(u32 num_entries)
-{
-	return 0;
-}
-
 static inline void debug_dma_map_single(struct device *dev, const void *addr,
 					unsigned long len)
 {
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 912c23f4c177..36a42874b05f 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -755,52 +755,6 @@ static void dma_entry_free(struct dma_debug_entry *entry)
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 }
 
-int dma_debug_resize_entries(u32 num_entries)
-{
-	int i, delta, ret = 0;
-	unsigned long flags;
-	struct dma_debug_entry *entry;
-	LIST_HEAD(tmp);
-
-	spin_lock_irqsave(&free_entries_lock, flags);
-
-	if (nr_total_entries < num_entries) {
-		delta = num_entries - nr_total_entries;
-
-		spin_unlock_irqrestore(&free_entries_lock, flags);
-
-		for (i = 0; i < delta; i++) {
-			entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-			if (!entry)
-				break;
-
-			list_add_tail(&entry->list, &tmp);
-		}
-
-		spin_lock_irqsave(&free_entries_lock, flags);
-
-		list_splice(&tmp, &free_entries);
-		nr_total_entries += i;
-		num_free_entries += i;
-	} else {
-		delta = nr_total_entries - num_entries;
-
-		for (i = 0; i < delta && !list_empty(&free_entries); i++) {
-			entry = __dma_entry_alloc();
-			kfree(entry);
-		}
-
-		nr_total_entries -= i;
-	}
-
-	if (nr_total_entries != num_entries)
-		ret = 1;
-
-	spin_unlock_irqrestore(&free_entries_lock, flags);
-
-	return ret;
-}
-
 /*
  * DMA-API debugging init code
  *
-- 
cgit 


From ad78dee0b630527bdfed809d1f5ed95c601886ae Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 10 Dec 2018 14:00:33 +0000
Subject: dma-debug: Batch dma_debug_entry allocation

DMA debug entries are one of those things which aren't that useful
individually - we will always want some larger quantity of them - and
which we don't really need to manage the exact number of - we only care
about having 'enough'. In that regard, the current behaviour of creating
them one-by-one leads to a lot of unwarranted function call overhead and
memory wasted on alignment padding.

Now that we don't have to worry about freeing anything via
dma_debug_resize_entries(), we can optimise the allocation behaviour by
grabbing whole pages at once, which will save considerably on the
aforementioned overheads, and probably offer a little more cache/TLB
locality benefit for traversing the lists under normal operation. This
should also give even less reason for an architecture-level override of
the preallocation size, so make the definition unconditional - if there
is still any desire to change the compile-time value for some platforms
it would be better off as a Kconfig option anyway.

Since freeing a whole page of entries at once becomes enough of a
challenge that it's not really worth complicating dma_debug_init(), we
may as well tweak the preallocation behaviour such that as long as we
manage to allocate *some* pages, we can leave debugging enabled on a
best-effort basis rather than otherwise wasting them.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Qian Cai <cai@lca.pw>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/DMA-API.txt |  4 +++-
 kernel/dma/debug.c        | 50 ++++++++++++++++++++---------------------------
 2 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 7a7d8a415ce8..016eb6909b8a 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -747,7 +747,9 @@ driver afterwards. This filter can be disabled or changed later using debugfs.
 When the code disables itself at runtime this is most likely because it ran
 out of dma_debug_entries and was unable to allocate more on-demand. 65536
 entries are preallocated at boot - if this is too low for you boot with
-'dma_debug_entries=<your_desired_number>' to overwrite the default. The
+'dma_debug_entries=<your_desired_number>' to overwrite the default. Note
+that the code allocates entries in batches, so the exact number of
+preallocated entries may be greater than the actual number requested. The
 code will print to the kernel log each time it has dynamically allocated
 as many entries as were initially preallocated. This is to indicate that a
 larger preallocation size may be appropriate, or if it happens continually
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36a42874b05f..20ab0f6c1b70 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -43,12 +43,9 @@
 #define HASH_FN_SHIFT   13
 #define HASH_FN_MASK    (HASH_SIZE - 1)
 
-/* allow architectures to override this if absolutely required */
-#ifndef PREALLOC_DMA_DEBUG_ENTRIES
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-#endif
 /* If the pool runs out, add this many new entries at once */
-#define DMA_DEBUG_DYNAMIC_ENTRIES 256
+#define DMA_DEBUG_DYNAMIC_ENTRIES (PAGE_SIZE / sizeof(struct dma_debug_entry))
 
 enum {
 	dma_debug_single,
@@ -648,32 +645,22 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	 */
 }
 
-static int dma_debug_create_entries(u32 num_entries, gfp_t gfp)
+static int dma_debug_create_entries(gfp_t gfp)
 {
-	struct dma_debug_entry *entry, *next_entry;
+	struct dma_debug_entry *entry;
 	int i;
 
-	for (i = 0; i < num_entries; ++i) {
-		entry = kzalloc(sizeof(*entry), gfp);
-		if (!entry)
-			goto out_err;
+	entry = (void *)get_zeroed_page(gfp);
+	if (!entry)
+		return -ENOMEM;
 
-		list_add_tail(&entry->list, &free_entries);
-	}
+	for (i = 0; i < DMA_DEBUG_DYNAMIC_ENTRIES; i++)
+		list_add_tail(&entry[i].list, &free_entries);
 
-	num_free_entries += num_entries;
-	nr_total_entries += num_entries;
+	num_free_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+	nr_total_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
 
 	return 0;
-
-out_err:
-
-	list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-
-	return -ENOMEM;
 }
 
 static struct dma_debug_entry *__dma_entry_alloc(void)
@@ -715,8 +702,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 
 	spin_lock_irqsave(&free_entries_lock, flags);
 	if (num_free_entries == 0) {
-		if (dma_debug_create_entries(DMA_DEBUG_DYNAMIC_ENTRIES,
-					     GFP_ATOMIC)) {
+		if (dma_debug_create_entries(GFP_ATOMIC)) {
 			global_disable = true;
 			spin_unlock_irqrestore(&free_entries_lock, flags);
 			pr_err("debugging out of memory - disabling\n");
@@ -987,7 +973,7 @@ void dma_debug_add_bus(struct bus_type *bus)
 
 static int dma_debug_init(void)
 {
-	int i;
+	int i, nr_pages;
 
 	/* Do not use dma_debug_initialized here, since we really want to be
 	 * called to set dma_debug_initialized
@@ -1007,15 +993,21 @@ static int dma_debug_init(void)
 		return 0;
 	}
 
-	if (dma_debug_create_entries(nr_prealloc_entries, GFP_KERNEL) != 0) {
+	nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
+	for (i = 0; i < nr_pages; ++i)
+		dma_debug_create_entries(GFP_KERNEL);
+	if (num_free_entries >= nr_prealloc_entries) {
+		pr_info("preallocated %d debug entries\n", nr_total_entries);
+	} else if (num_free_entries > 0) {
+		pr_warn("%d debug entries requested but only %d allocated\n",
+			nr_prealloc_entries, nr_total_entries);
+	} else {
 		pr_err("debugging out of memory error - disabled\n");
 		global_disable = true;
 
 		return 0;
 	}
-
 	min_free_entries = num_free_entries;
-	pr_info("preallocated %d debug entries\n", nr_total_entries);
 
 	dma_debug_initialized = true;
 
-- 
cgit 


From 1431a5d2cfa18d7006d9b0e7ab4548d9bb19ce55 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:32 -0800
Subject: locking/lockdep: Declare local symbols static

This patch avoids that sparse complains about a missing declaration for
the lock_classes array when building with CONFIG_DEBUG_LOCKDEP=n.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-9-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1efada2dd9dd..7434a00b2b2f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -138,6 +138,9 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
  * get freed - this significantly simplifies the debugging code.
  */
 unsigned long nr_lock_classes;
+#ifndef CONFIG_DEBUG_LOCKDEP
+static
+#endif
 struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 
 static inline struct lock_class *hlock_class(struct held_lock *hlock)
-- 
cgit 


From d35568bdb6ce4be3f885f8f189bbde5adc7e0160 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:33 -0800
Subject: locking/lockdep: Inline __lockdep_init_map()

Since the function __lockdep_init_map() only has one caller, inline it
into its caller. This patch does not change any functionality.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-10-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7434a00b2b2f..b5c8fcb6c070 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3091,7 +3091,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 /*
  * Initialize a lock instance's lock-class mapping info:
  */
-static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
 	int i;
@@ -3147,12 +3147,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 		raw_local_irq_restore(flags);
 	}
 }
-
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
-		      struct lock_class_key *key, int subclass)
-{
-	__lockdep_init_map(lock, name, key, subclass);
-}
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
 struct lock_class_key __lockdep_no_validate__;
-- 
cgit 


From 2904d9fa45d3ce7153f1e10d78c570ecf7f19c35 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:34 -0800
Subject: locking/lockdep: Introduce lock_class_cache_is_registered()

This patch does not change any functionality but makes the
lockdep_reset_lock() function easier to read.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-11-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 50 +++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b5c8fcb6c070..81388d028ac7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4201,13 +4201,33 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	 */
 }
 
-void lockdep_reset_lock(struct lockdep_map *lock)
+/*
+ * Check whether any element of the @lock->class_cache[] array refers to a
+ * registered lock class. The caller must hold either the graph lock or the
+ * RCU read lock.
+ */
+static bool lock_class_cache_is_registered(struct lockdep_map *lock)
 {
 	struct lock_class *class;
 	struct hlist_head *head;
-	unsigned long flags;
 	int i, j;
-	int locked;
+
+	for (i = 0; i < CLASSHASH_SIZE; i++) {
+		head = classhash_table + i;
+		hlist_for_each_entry_rcu(class, head, hash_entry) {
+			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+				if (lock->class_cache[j] == class)
+					return true;
+		}
+	}
+	return false;
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+	struct lock_class *class;
+	unsigned long flags;
+	int j, locked;
 
 	raw_local_irq_save(flags);
 
@@ -4227,24 +4247,14 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 	 * be gone.
 	 */
 	locked = graph_lock();
-	for (i = 0; i < CLASSHASH_SIZE; i++) {
-		head = classhash_table + i;
-		hlist_for_each_entry_rcu(class, head, hash_entry) {
-			int match = 0;
-
-			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
-				match |= class == lock->class_cache[j];
-
-			if (unlikely(match)) {
-				if (debug_locks_off_graph_unlock()) {
-					/*
-					 * We all just reset everything, how did it match?
-					 */
-					WARN_ON(1);
-				}
-				goto out_restore;
-			}
+	if (unlikely(lock_class_cache_is_registered(lock))) {
+		if (debug_locks_off_graph_unlock()) {
+			/*
+			 * We all just reset everything, how did it match?
+			 */
+			WARN_ON(1);
 		}
+		goto out_restore;
 	}
 	if (locked)
 		graph_unlock();
-- 
cgit 


From a66b6922dc6a5ece60ea9326153da3b062977a4d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:35 -0800
Subject: locking/lockdep: Remove a superfluous INIT_LIST_HEAD() statement

Initializing a list entry just before it is passed to list_add_tail_rcu()
is not necessary because list_add_tail_rcu() overwrites the next and prev
pointers anyway. Hence remove the INIT_LIST_HEAD() statement.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-12-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81388d028ac7..346b5a1fd062 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -792,7 +792,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	class->key = key;
 	class->name = lock->name;
 	class->subclass = subclass;
-	INIT_LIST_HEAD(&class->lock_entry);
 	INIT_LIST_HEAD(&class->locks_before);
 	INIT_LIST_HEAD(&class->locks_after);
 	class->name_version = count_matching_names(class);
-- 
cgit 


From 786fa29e9cb6810e21ab0d9c41a81d81d54d1d1b Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:36 -0800
Subject: locking/lockdep: Make concurrent lockdep_reset_lock() calls safe

Since zap_class() removes items from the all_lock_classes list and the
classhash_table, protect all zap_class() calls against concurrent
data structure modifications with the graph lock.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-13-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 346b5a1fd062..737d2dd3ea56 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4122,6 +4122,9 @@ void lockdep_reset(void)
 	raw_local_irq_restore(flags);
 }
 
+/*
+ * Remove all references to a lock class. The caller must hold the graph lock.
+ */
 static void zap_class(struct lock_class *class)
 {
 	int i;
@@ -4229,6 +4232,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 	int j, locked;
 
 	raw_local_irq_save(flags);
+	locked = graph_lock();
 
 	/*
 	 * Remove all classes this lock might have:
@@ -4245,7 +4249,6 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 	 * Debug check: in the end all mapped classes should
 	 * be gone.
 	 */
-	locked = graph_lock();
 	if (unlikely(lock_class_cache_is_registered(lock))) {
 		if (debug_locks_off_graph_unlock()) {
 			/*
-- 
cgit 


From fe27b0de8dfcdf8482558ce5d25e697fe74d851e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 6 Dec 2018 17:11:37 -0800
Subject: locking/lockdep: Stop using RCU primitives to access
 'all_lock_classes'

Due to the previous patch all code that accesses the 'all_lock_classes'
list holds the graph lock. Hence use regular list primitives instead of
their RCU variants to access this list.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: johannes.berg@intel.com
Cc: tj@kernel.org
Link: https://lkml.kernel.org/r/20181207011148.251812-14-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 737d2dd3ea56..5c837a537273 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -629,7 +629,8 @@ static int static_obj(void *obj)
 
 /*
  * To make lock name printouts unique, we calculate a unique
- * class->name_version generation counter:
+ * class->name_version generation counter. The caller must hold the graph
+ * lock.
  */
 static int count_matching_names(struct lock_class *new_class)
 {
@@ -639,7 +640,7 @@ static int count_matching_names(struct lock_class *new_class)
 	if (!new_class->name)
 		return 0;
 
-	list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+	list_for_each_entry(class, &all_lock_classes, lock_entry) {
 		if (new_class->key - new_class->subclass == class->key)
 			return class->name_version;
 		if (class->name && !strcmp(class->name, new_class->name))
@@ -803,7 +804,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	/*
 	 * Add it to the global list of classes:
 	 */
-	list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+	list_add_tail(&class->lock_entry, &all_lock_classes);
 
 	if (verbose(class)) {
 		graph_unlock();
@@ -4141,7 +4142,7 @@ static void zap_class(struct lock_class *class)
 	 * Unhash the class and remove it from the all_lock_classes list:
 	 */
 	hlist_del_rcu(&class->hash_entry);
-	list_del_rcu(&class->lock_entry);
+	list_del(&class->lock_entry);
 
 	RCU_INIT_POINTER(class->key, NULL);
 	RCU_INIT_POINTER(class->name, NULL);
-- 
cgit 


From 80eb865768703c0f85a0603762742ae1dedf21f0 Mon Sep 17 00:00:00 2001
From: Andrea Parri <andrea.parri@amarulasolutions.com>
Date: Tue, 27 Nov 2018 12:01:10 +0100
Subject: sched/fair: Clean up comment in nohz_idle_balance()

Concerning the comment associated to the atomic_fetch_andnot() in
nohz_idle_balance(), Vincent explains [1]:

  "[...] the comment is useless and can be removed [...]  it was
   referring to a line code above the comment that was present in
   a previous iteration of the patchset. This line disappeared in
   final version but the comment has stayed."

So remove the comment.

Vincent also points out that the full ordering associated to the
atomic_fetch_andnot() primitive could be relaxed, but this patch
insists on the current more conservative/fully ordered solution:

"Performance" isn't a concern, stay away from "correctness"/subtle
relaxed (re)ordering if possible..., just make sure not to confuse
the next reader with misleading/out-of-date comments.

[1] http://lkml.kernel.org/r/CAKfTPtBjA-oCBRkO6__npQwL3+HLjzk7riCcPU1R7YdO-EpuZg@mail.gmail.com

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20181127110110.5533-1-andrea.parri@amarulasolutions.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac855b2f4774..db514993565b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9533,9 +9533,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		return false;
 	}
 
-	/*
-	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
-	 */
+	/* could be _relaxed() */
 	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
 	if (!(flags & NOHZ_KICK_MASK))
 		return false;
-- 
cgit 


From 765d0af19f5f388a34bf4533378f8398b72ded46 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 29 Aug 2018 15:19:11 +0200
Subject: sched/topology: Remove the ::smt_gain field from 'struct
 sched_domain'

::smt_gain is used to compute the capacity of CPUs of a SMT core with the
constraint 1 < ::smt_gain < 2 in order to be able to compute number of CPUs
per core. The field has_free_capacity of struct numa_stat, which was the
last user of this computation of number of CPUs per core, has been removed
by:

  2d4056fafa19 ("sched/numa: Remove numa_has_capacity()")

We can now remove this constraint on core capacity and use the defautl value
SCHED_CAPACITY_SCALE for SMT CPUs. With this remove, SCHED_CAPACITY_SCALE
becomes the maximum compute capacity of CPUs on every systems. This should
help to simplify some code and remove fields like rd->max_cpu_capacity

Furthermore, arch_scale_cpu_capacity() is used with a NULL sd in several other
places in the code when it wants the capacity of a CPUs to scale
some metrics like in pelt, deadline or schedutil. In case on SMT, the value
returned is not the capacity of SMT CPUs but default SCHED_CAPACITY_SCALE.

So remove it.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1535548752-4434-4-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h | 1 -
 kernel/sched/sched.h           | 3 ---
 kernel/sched/topology.c        | 2 --
 3 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6b9976180c1e..7fa0bc17cd8c 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -89,7 +89,6 @@ struct sched_domain {
 	unsigned int newidle_idx;
 	unsigned int wake_idx;
 	unsigned int forkexec_idx;
-	unsigned int smt_gain;
 
 	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9bde60a11805..ceb896404869 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1864,9 +1864,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
 static __always_inline
 unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-		return sd->smt_gain / sd->span_weight;
-
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8d7f15ba5916..7364e0b427b7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1133,7 +1133,6 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 		.child			= child,
@@ -1164,7 +1163,6 @@ sd_init(struct sched_domain_topology_level *tl,
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
-- 
cgit 


From 9ebc6053814d37b9de8cc291fba28f30a729c929 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 3 Nov 2018 13:26:02 -0400
Subject: sched/core: Remove unnecessary unlikely() in push_*_task()

WARN_ON() already contains an unlikely(), so it's not necessary to
use WARN_ON(1).

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20181103172602.1917-1-tiny.windzz@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 4 +---
 kernel/sched/rt.c       | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b32bc1f7cd14..fb8b7b5d745d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2042,10 +2042,8 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
-	if (unlikely(next_task == rq->curr)) {
-		WARN_ON(1);
+	if (WARN_ON(next_task == rq->curr))
 		return 0;
-	}
 
 	/*
 	 * If next_task preempts rq->curr, and rq->curr
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9aa3287ce301..e4f398ad9e73 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1813,10 +1813,8 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
-	if (unlikely(next_task == rq->curr)) {
-		WARN_ON(1);
+	if (WARN_ON(next_task == rq->curr))
 		return 0;
-	}
 
 	/*
 	 * It's possible that the next_task slipped in of
-- 
cgit 


From 5bd0988be12733a42a1a3d50e3e2ddfd79e57518 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:14 +0000
Subject: sched/topology: Relocate arch_scale_cpu_capacity() to the internal
 header

By default, arch_scale_cpu_capacity() is only visible from within the
kernel/sched folder. Relocate it to include/linux/sched/topology.h to
make it visible to other clients needing to know about the capacity of
CPUs, such as the Energy Model framework.

This also shrinks the <linux/sched/topology.h> public header.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-2-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h | 16 ++++++++++++++++
 kernel/sched/sched.h           | 18 ------------------
 2 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7fa0bc17cd8c..c31d3a47a47c 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -201,6 +201,14 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
 # define SD_INIT_NAME(type)
 #endif
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -216,6 +224,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 #endif	/* !CONFIG_SMP */
 
 static inline int task_node(const struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ceb896404869..66067152a831 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1859,24 +1859,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
 }
 #endif
 
-#ifdef CONFIG_SMP
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-	return SCHED_CAPACITY_SCALE;
-}
-#endif
-#else
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
-{
-	return SCHED_CAPACITY_SCALE;
-}
-#endif
-#endif
-
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
-- 
cgit 


From 938e5e4b0d1502a93e787985cb95b136b40717b7 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:15 +0000
Subject: sched/cpufreq: Prepare schedutil for Energy Aware Scheduling

Schedutil requests frequency by aggregating utilization signals from
the scheduler (CFS, RT, DL, IRQ) and applying a 25% margin on top of
them. Since Energy Aware Scheduling (EAS) needs to be able to predict
the frequency requests, it needs to forecast the decisions made by the
governor.

In order to prepare the introduction of EAS, introduce
schedutil_freq_util() to centralize the aforementioned signal
aggregation and make it available to both schedutil and EAS. Since
frequency selection and energy estimation still need to deal with RT and
DL signals slightly differently, schedutil_freq_util() is called with a
different 'type' parameter in those two contexts, and returns an
aggregated utilization signal accordingly. While at it, introduce the
map_util_freq() function which is designed to make schedutil's 25%
margin usable easily for both sugov and EAS.

As EAS will be able to predict schedutil's frequency requests more
accurately than any other governor by design, it'd be sensible to make
sure EAS cannot be used without schedutil. This will be done later, once
EAS has actually been introduced.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-3-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/cpufreq.h    |  6 +++++
 kernel/sched/cpufreq_schedutil.c | 53 ++++++++++++++++++++++++++++------------
 kernel/sched/sched.h             | 30 +++++++++++++++++++++++
 3 files changed, 74 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 59667444669f..afa940cd50dc 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
                        void (*func)(struct update_util_data *data, u64 time,
 				    unsigned int flags));
 void cpufreq_remove_update_util_hook(int cpu);
+
+static inline unsigned long map_util_freq(unsigned long util,
+					unsigned long freq, unsigned long cap)
+{
+	return (freq + (freq >> 2)) * util / cap;
+}
 #endif /* CONFIG_CPU_FREQ */
 
 #endif /* _LINUX_SCHED_CPUFREQ_H */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3fffad3bc8a8..90128be27712 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,7 @@
 
 #include "sched.h"
 
+#include <linux/sched/cpufreq.h>
 #include <trace/events/power.h>
 
 struct sugov_tunables {
@@ -167,7 +168,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	freq = (freq + (freq >> 2)) * util / max;
+	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
 		return sg_policy->next_freq;
@@ -197,15 +198,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
  * based on the task model parameters and gives the minimal utilization
  * required to meet deadlines.
  */
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+				  unsigned long max, enum schedutil_type type)
 {
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util, irq, max;
+	unsigned long dl_util, util, irq;
+	struct rq *rq = cpu_rq(cpu);
 
-	sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
-	sg_cpu->bw_dl = cpu_bw_dl(rq);
-
-	if (rt_rq_is_runnable(&rq->rt))
+	if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
 		return max;
 
 	/*
@@ -223,21 +222,30 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 	 * utilization (PELT windows are synchronized) we can directly add them
 	 * to obtain the CPU's actual utilization.
 	 */
-	util = cpu_util_cfs(rq);
+	util = util_cfs;
 	util += cpu_util_rt(rq);
 
+	dl_util = cpu_util_dl(rq);
+
 	/*
-	 * We do not make cpu_util_dl() a permanent part of this sum because we
-	 * want to use cpu_bw_dl() later on, but we need to check if the
-	 * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
-	 * f_max when there is no idle time.
+	 * For frequency selection we do not make cpu_util_dl() a permanent part
+	 * of this sum because we want to use cpu_bw_dl() later on, but we need
+	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+	 * that we select f_max when there is no idle time.
 	 *
 	 * NOTE: numerical errors or stop class might cause us to not quite hit
 	 * saturation when we should -- something for later.
 	 */
-	if ((util + cpu_util_dl(rq)) >= max)
+	if (util + dl_util >= max)
 		return max;
 
+	/*
+	 * OTOH, for energy computation we need the estimated running time, so
+	 * include util_dl and ignore dl_bw.
+	 */
+	if (type == ENERGY_UTIL)
+		util += dl_util;
+
 	/*
 	 * There is still idle time; further improve the number by using the
 	 * irq metric. Because IRQ/steal time is hidden from the task clock we
@@ -260,7 +268,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
 	 * an interface. So, we only do the latter for now.
 	 */
-	return min(max, util + sg_cpu->bw_dl);
+	if (type == FREQUENCY_UTIL)
+		util += cpu_bw_dl(rq);
+
+	return min(max, util);
+}
+
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+{
+	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util = cpu_util_cfs(rq);
+	unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+
+	sg_cpu->max = max;
+	sg_cpu->bw_dl = cpu_bw_dl(rq);
+
+	return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
 }
 
 /**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 66067152a831..2eafa228aebf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2191,6 +2191,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif
 
 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+/**
+ * enum schedutil_type - CPU utilization type
+ * @FREQUENCY_UTIL:	Utilization used to select frequency
+ * @ENERGY_UTIL:	Utilization used during energy calculation
+ *
+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
+ * need to be aggregated differently depending on the usage made of them. This
+ * enum is used within schedutil_freq_util() to differentiate the types of
+ * utilization expected by the callers, and adjust the aggregation accordingly.
+ */
+enum schedutil_type {
+	FREQUENCY_UTIL,
+	ENERGY_UTIL,
+};
+
+unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+				  unsigned long max, enum schedutil_type type);
+
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+{
+	unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+
+	return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
+}
+
 static inline unsigned long cpu_bw_dl(struct rq *rq)
 {
 	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2217,6 +2242,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 {
 	return READ_ONCE(rq->avg_rt.util_avg);
 }
+#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+{
+	return cfs;
+}
 #endif
 
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
-- 
cgit 


From 27871f7a8a341ef5c636a337856369acf8013e4e Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:16 +0000
Subject: PM: Introduce an Energy Model management framework

Several subsystems in the kernel (task scheduler and/or thermal at the
time of writing) can benefit from knowing about the energy consumed by
CPUs. Yet, this information can come from different sources (DT or
firmware for example), in different formats, hence making it hard to
exploit without a standard API.

As an attempt to address this, introduce a centralized Energy Model
(EM) management framework which aggregates the power values provided
by drivers into a table for each performance domain in the system. The
power cost tables are made available to interested clients (e.g. task
scheduler or thermal) via platform-agnostic APIs. The overall design
is represented by the diagram below (focused on Arm-related drivers as
an example, but applicable to any architecture):

     +---------------+  +-----------------+  +-------------+
     | Thermal (IPA) |  | Scheduler (EAS) |  |    Other    |
     +---------------+  +-----------------+  +-------------+
             |                   | em_pd_energy()   |
             |                   | em_cpu_get()     |
             +-----------+       |         +--------+
                         |       |         |
                         v       v         v
                      +---------------------+
                      |                     |
                      |    Energy Model     |
                      |                     |
                      |     Framework       |
                      |                     |
                      +---------------------+
                         ^       ^       ^
                         |       |       | em_register_perf_domain()
              +----------+       |       +---------+
              |                  |                 |
      +---------------+  +---------------+  +--------------+
      |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
      +---------------+  +---------------+  +--------------+
              ^                  ^                 ^
              |                  |                 |
      +--------------+   +---------------+  +--------------+
      | Device Tree  |   |   Firmware    |  |      ?       |
      +--------------+   +---------------+  +--------------+

Drivers (typically, but not limited to, CPUFreq drivers) can register
data in the EM framework using the em_register_perf_domain() API. The
calling driver must provide a callback function with a standardized
signature that will be used by the EM framework to build the power
cost tables of the performance domain. This design should offer a lot of
flexibility to calling drivers which are free of reading information
from any location and to use any technique to compute power costs.
Moreover, the capacity states registered by drivers in the EM framework
are not required to match real performance states of the target. This
is particularly important on targets where the performance states are
not known by the OS.

The power cost coefficients managed by the EM framework are specified in
milli-watts. Although the two potential users of those coefficients (IPA
and EAS) only need relative correctness, IPA specifically needs to
compare the power of CPUs with the power of other components (GPUs, for
example), which are still expressed in absolute terms in their
respective subsystems. Hence, specifying the power of CPUs in
milli-watts should help transitioning IPA to using the EM framework
without introducing new problems by keeping units comparable across
sub-systems.
On the longer term, the EM of other devices than CPUs could also be
managed by the EM framework, which would enable to remove the absolute
unit. However, this is not absolutely required as a first step, so this
extension of the EM framework is left for later.

On the client side, the EM framework offers APIs to access the power
cost tables of a CPU (em_cpu_get()), and to estimate the energy
consumed by the CPUs of a performance domain (em_pd_energy()). Clients
such as the task scheduler can then use these APIs to access the shared
data structures holding the Energy Model of CPUs.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-4-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/energy_model.h | 187 ++++++++++++++++++++++++++++++++++++++++
 kernel/power/Kconfig         |  15 ++++
 kernel/power/Makefile        |   2 +
 kernel/power/energy_model.c  | 201 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 include/linux/energy_model.h
 create mode 100644 kernel/power/energy_model.c

(limited to 'kernel')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
new file mode 100644
index 000000000000..aa027f7bcb3e
--- /dev/null
+++ b/include/linux/energy_model.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ENERGY_MODEL_H
+#define _LINUX_ENERGY_MODEL_H
+#include <linux/cpumask.h>
+#include <linux/jump_label.h>
+#include <linux/kobject.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/topology.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_ENERGY_MODEL
+/**
+ * em_cap_state - Capacity state of a performance domain
+ * @frequency:	The CPU frequency in KHz, for consistency with CPUFreq
+ * @power:	The power consumed by 1 CPU at this level, in milli-watts
+ * @cost:	The cost coefficient associated with this level, used during
+ *		energy calculation. Equal to: power * max_frequency / frequency
+ */
+struct em_cap_state {
+	unsigned long frequency;
+	unsigned long power;
+	unsigned long cost;
+};
+
+/**
+ * em_perf_domain - Performance domain
+ * @table:		List of capacity states, in ascending order
+ * @nr_cap_states:	Number of capacity states
+ * @cpus:		Cpumask covering the CPUs of the domain
+ *
+ * A "performance domain" represents a group of CPUs whose performance is
+ * scaled together. All CPUs of a performance domain must have the same
+ * micro-architecture. Performance domains often have a 1-to-1 mapping with
+ * CPUFreq policies.
+ */
+struct em_perf_domain {
+	struct em_cap_state *table;
+	int nr_cap_states;
+	unsigned long cpus[0];
+};
+
+#define EM_CPU_MAX_POWER 0xFFFF
+
+struct em_data_callback {
+	/**
+	 * active_power() - Provide power at the next capacity state of a CPU
+	 * @power	: Active power at the capacity state in mW (modified)
+	 * @freq	: Frequency at the capacity state in kHz (modified)
+	 * @cpu		: CPU for which we do this operation
+	 *
+	 * active_power() must find the lowest capacity state of 'cpu' above
+	 * 'freq' and update 'power' and 'freq' to the matching active power
+	 * and frequency.
+	 *
+	 * The power is the one of a single CPU in the domain, expressed in
+	 * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER]
+	 * range.
+	 *
+	 * Return 0 on success.
+	 */
+	int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
+};
+#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
+
+struct em_perf_domain *em_cpu_get(int cpu);
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+						struct em_data_callback *cb);
+
+/**
+ * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
+ * @pd		: performance domain for which energy has to be estimated
+ * @max_util	: highest utilization among CPUs of the domain
+ * @sum_util	: sum of the utilization of all CPUs in the domain
+ *
+ * Return: the sum of the energy consumed by the CPUs of the domain assuming
+ * a capacity state satisfying the max utilization of the domain.
+ */
+static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+				unsigned long max_util, unsigned long sum_util)
+{
+	unsigned long freq, scale_cpu;
+	struct em_cap_state *cs;
+	int i, cpu;
+
+	/*
+	 * In order to predict the capacity state, map the utilization of the
+	 * most utilized CPU of the performance domain to a requested frequency,
+	 * like schedutil.
+	 */
+	cpu = cpumask_first(to_cpumask(pd->cpus));
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+	cs = &pd->table[pd->nr_cap_states - 1];
+	freq = map_util_freq(max_util, cs->frequency, scale_cpu);
+
+	/*
+	 * Find the lowest capacity state of the Energy Model above the
+	 * requested frequency.
+	 */
+	for (i = 0; i < pd->nr_cap_states; i++) {
+		cs = &pd->table[i];
+		if (cs->frequency >= freq)
+			break;
+	}
+
+	/*
+	 * The capacity of a CPU in the domain at that capacity state (cs)
+	 * can be computed as:
+	 *
+	 *             cs->freq * scale_cpu
+	 *   cs->cap = --------------------                          (1)
+	 *                 cpu_max_freq
+	 *
+	 * So, ignoring the costs of idle states (which are not available in
+	 * the EM), the energy consumed by this CPU at that capacity state is
+	 * estimated as:
+	 *
+	 *             cs->power * cpu_util
+	 *   cpu_nrg = --------------------                          (2)
+	 *                   cs->cap
+	 *
+	 * since 'cpu_util / cs->cap' represents its percentage of busy time.
+	 *
+	 *   NOTE: Although the result of this computation actually is in
+	 *         units of power, it can be manipulated as an energy value
+	 *         over a scheduling period, since it is assumed to be
+	 *         constant during that interval.
+	 *
+	 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
+	 * of two terms:
+	 *
+	 *             cs->power * cpu_max_freq   cpu_util
+	 *   cpu_nrg = ------------------------ * ---------          (3)
+	 *                    cs->freq            scale_cpu
+	 *
+	 * The first term is static, and is stored in the em_cap_state struct
+	 * as 'cs->cost'.
+	 *
+	 * Since all CPUs of the domain have the same micro-architecture, they
+	 * share the same 'cs->cost', and the same CPU capacity. Hence, the
+	 * total energy of the domain (which is the simple sum of the energy of
+	 * all of its CPUs) can be factorized as:
+	 *
+	 *            cs->cost * \Sum cpu_util
+	 *   pd_nrg = ------------------------                       (4)
+	 *                  scale_cpu
+	 */
+	return cs->cost * sum_util / scale_cpu;
+}
+
+/**
+ * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
+ * @pd		: performance domain for which this must be done
+ *
+ * Return: the number of capacity states in the performance domain table
+ */
+static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+{
+	return pd->nr_cap_states;
+}
+
+#else
+struct em_perf_domain {};
+struct em_data_callback {};
+#define EM_DATA_CB(_active_power_cb) { }
+
+static inline int em_register_perf_domain(cpumask_t *span,
+			unsigned int nr_states, struct em_data_callback *cb)
+{
+	return -EINVAL;
+}
+static inline struct em_perf_domain *em_cpu_get(int cpu)
+{
+	return NULL;
+}
+static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+			unsigned long max_util, unsigned long sum_util)
+{
+	return 0;
+}
+static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3a6c2f87699e..f8fe57d1022e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF
 
 config CPU_PM
 	bool
+
+config ENERGY_MODEL
+	bool "Energy Model for CPUs"
+	depends on SMP
+	depends on CPU_FREQ
+	default n
+	help
+	  Several subsystems (thermal and/or the task scheduler for example)
+	  can leverage information about the energy consumed by CPUs to make
+	  smarter decisions. This config option enables the framework from
+	  which subsystems can access the energy models.
+
+	  The exact usage of the energy model is subsystem-dependent.
+
+	  If in doubt, say N.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index a3f79f0eef36..e7e47d9be1e5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP)	+= autosleep.o
 obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
+
+obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
new file mode 100644
index 000000000000..d9dc2c38764a
--- /dev/null
+++ b/kernel/power/energy_model.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Energy Model of CPUs
+ *
+ * Copyright (c) 2018, Arm ltd.
+ * Written by: Quentin Perret, Arm ltd.
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/energy_model.h>
+#include <linux/sched/topology.h>
+#include <linux/slab.h>
+
+/* Mapping of each CPU to the performance domain to which it belongs. */
+static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
+
+/*
+ * Mutex serializing the registrations of performance domains and letting
+ * callbacks defined by drivers sleep.
+ */
+static DEFINE_MUTEX(em_pd_mutex);
+
+static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
+						struct em_data_callback *cb)
+{
+	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
+	unsigned long power, freq, prev_freq = 0;
+	int i, ret, cpu = cpumask_first(span);
+	struct em_cap_state *table;
+	struct em_perf_domain *pd;
+	u64 fmax;
+
+	if (!cb->active_power)
+		return NULL;
+
+	pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+
+	table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		goto free_pd;
+
+	/* Build the list of capacity states for this performance domain */
+	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
+		/*
+		 * active_power() is a driver callback which ceils 'freq' to
+		 * lowest capacity state of 'cpu' above 'freq' and updates
+		 * 'power' and 'freq' accordingly.
+		 */
+		ret = cb->active_power(&power, &freq, cpu);
+		if (ret) {
+			pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
+			goto free_cs_table;
+		}
+
+		/*
+		 * We expect the driver callback to increase the frequency for
+		 * higher capacity states.
+		 */
+		if (freq <= prev_freq) {
+			pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
+			goto free_cs_table;
+		}
+
+		/*
+		 * The power returned by active_state() is expected to be
+		 * positive, in milli-watts and to fit into 16 bits.
+		 */
+		if (!power || power > EM_CPU_MAX_POWER) {
+			pr_err("pd%d: invalid power: %lu\n", cpu, power);
+			goto free_cs_table;
+		}
+
+		table[i].power = power;
+		table[i].frequency = prev_freq = freq;
+
+		/*
+		 * The hertz/watts efficiency ratio should decrease as the
+		 * frequency grows on sane platforms. But this isn't always
+		 * true in practice so warn the user if a higher OPP is more
+		 * power efficient than a lower one.
+		 */
+		opp_eff = freq / power;
+		if (opp_eff >= prev_opp_eff)
+			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
+					cpu, i, i - 1);
+		prev_opp_eff = opp_eff;
+	}
+
+	/* Compute the cost of each capacity_state. */
+	fmax = (u64) table[nr_states - 1].frequency;
+	for (i = 0; i < nr_states; i++) {
+		table[i].cost = div64_u64(fmax * table[i].power,
+					  table[i].frequency);
+	}
+
+	pd->table = table;
+	pd->nr_cap_states = nr_states;
+	cpumask_copy(to_cpumask(pd->cpus), span);
+
+	return pd;
+
+free_cs_table:
+	kfree(table);
+free_pd:
+	kfree(pd);
+
+	return NULL;
+}
+
+/**
+ * em_cpu_get() - Return the performance domain for a CPU
+ * @cpu : CPU to find the performance domain for
+ *
+ * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_cpu_get(int cpu)
+{
+	return READ_ONCE(per_cpu(em_data, cpu));
+}
+EXPORT_SYMBOL_GPL(em_cpu_get);
+
+/**
+ * em_register_perf_domain() - Register the Energy Model of a performance domain
+ * @span	: Mask of CPUs in the performance domain
+ * @nr_states	: Number of capacity states to register
+ * @cb		: Callback functions providing the data of the Energy Model
+ *
+ * Create Energy Model tables for a performance domain using the callbacks
+ * defined in cb.
+ *
+ * If multiple clients register the same performance domain, all but the first
+ * registration will be ignored.
+ *
+ * Return 0 on success
+ */
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+						struct em_data_callback *cb)
+{
+	unsigned long cap, prev_cap = 0;
+	struct em_perf_domain *pd;
+	int cpu, ret = 0;
+
+	if (!span || !nr_states || !cb)
+		return -EINVAL;
+
+	/*
+	 * Use a mutex to serialize the registration of performance domains and
+	 * let the driver-defined callback functions sleep.
+	 */
+	mutex_lock(&em_pd_mutex);
+
+	for_each_cpu(cpu, span) {
+		/* Make sure we don't register again an existing domain. */
+		if (READ_ONCE(per_cpu(em_data, cpu))) {
+			ret = -EEXIST;
+			goto unlock;
+		}
+
+		/*
+		 * All CPUs of a domain must have the same micro-architecture
+		 * since they all share the same table.
+		 */
+		cap = arch_scale_cpu_capacity(NULL, cpu);
+		if (prev_cap && prev_cap != cap) {
+			pr_err("CPUs of %*pbl must have the same capacity\n",
+							cpumask_pr_args(span));
+			ret = -EINVAL;
+			goto unlock;
+		}
+		prev_cap = cap;
+	}
+
+	/* Create the performance domain and add it to the Energy Model. */
+	pd = em_create_pd(span, nr_states, cb);
+	if (!pd) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for_each_cpu(cpu, span) {
+		/*
+		 * The per-cpu array can be read concurrently from em_cpu_get().
+		 * The barrier enforces the ordering needed to make sure readers
+		 * can only access well formed em_perf_domain structs.
+		 */
+		smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
+	}
+
+	pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
+unlock:
+	mutex_unlock(&em_pd_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(em_register_perf_domain);
-- 
cgit 


From 6aa140fa4508933a6ac6717d65a403eb904d6c02 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:18 +0000
Subject: sched/topology: Reference the Energy Model of CPUs when available

The existing scheduling domain hierarchy is defined to map to the cache
topology of the system. However, Energy Aware Scheduling (EAS) requires
more knowledge about the platform, and specifically needs to know about
the span of Performance Domains (PD), which do not always align with
caches.

To address this issue, use the Energy Model (EM) of the system to extend
the scheduler topology code with a representation of the PDs, alongside
the scheduling domains. More specifically, a linked list of PDs is
attached to each root domain. When multiple root domains are in use,
each list contains only the PDs covering the CPUs of its root domain. If
a PD spans over CPUs of multiple different root domains, it will be
duplicated in all lists.

The lists are fully maintained by the scheduler from
partition_sched_domains() in order to cope with hotplug and cpuset
changes. As for scheduling domains, the list are protected by RCU to
ensure safe concurrent updates.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-6-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h    |  21 ++++++++
 kernel/sched/topology.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 151 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2eafa228aebf..808a565187b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -45,6 +45,7 @@
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/delayacct.h>
+#include <linux/energy_model.h>
 #include <linux/init_task.h>
 #include <linux/kprobes.h>
 #include <linux/kthread.h>
@@ -709,6 +710,12 @@ static inline bool sched_asym_prefer(int a, int b)
 	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
 }
 
+struct perf_domain {
+	struct em_perf_domain *em_pd;
+	struct perf_domain *next;
+	struct rcu_head rcu;
+};
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -761,6 +768,12 @@ struct root_domain {
 	struct cpupri		cpupri;
 
 	unsigned long		max_cpu_capacity;
+
+	/*
+	 * NULL-terminated list of performance domains intersecting with the
+	 * CPUs of the rd. Protected by RCU.
+	 */
+	struct perf_domain	*pd;
 };
 
 extern struct root_domain def_root_domain;
@@ -2276,3 +2289,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 	return util;
 }
 #endif
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_ENERGY_MODEL
+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
+#else
+#define perf_domain_span(pd) NULL
+#endif
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 7364e0b427b7..169d25cafab5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,6 +201,116 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
+#ifdef CONFIG_ENERGY_MODEL
+static void free_pd(struct perf_domain *pd)
+{
+	struct perf_domain *tmp;
+
+	while (pd) {
+		tmp = pd->next;
+		kfree(pd);
+		pd = tmp;
+	}
+}
+
+static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
+{
+	while (pd) {
+		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
+			return pd;
+		pd = pd->next;
+	}
+
+	return NULL;
+}
+
+static struct perf_domain *pd_init(int cpu)
+{
+	struct em_perf_domain *obj = em_cpu_get(cpu);
+	struct perf_domain *pd;
+
+	if (!obj) {
+		if (sched_debug())
+			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
+		return NULL;
+	}
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+	pd->em_pd = obj;
+
+	return pd;
+}
+
+static void perf_domain_debug(const struct cpumask *cpu_map,
+						struct perf_domain *pd)
+{
+	if (!sched_debug() || !pd)
+		return;
+
+	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
+
+	while (pd) {
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+				cpumask_first(perf_domain_span(pd)),
+				cpumask_pr_args(perf_domain_span(pd)),
+				em_pd_nr_cap_states(pd->em_pd));
+		pd = pd->next;
+	}
+
+	printk(KERN_CONT "\n");
+}
+
+static void destroy_perf_domain_rcu(struct rcu_head *rp)
+{
+	struct perf_domain *pd;
+
+	pd = container_of(rp, struct perf_domain, rcu);
+	free_pd(pd);
+}
+
+static void build_perf_domains(const struct cpumask *cpu_map)
+{
+	struct perf_domain *pd = NULL, *tmp;
+	int cpu = cpumask_first(cpu_map);
+	struct root_domain *rd = cpu_rq(cpu)->rd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Skip already covered CPUs. */
+		if (find_pd(pd, i))
+			continue;
+
+		/* Create the new pd and add it to the local list. */
+		tmp = pd_init(i);
+		if (!tmp)
+			goto free;
+		tmp->next = pd;
+		pd = tmp;
+	}
+
+	perf_domain_debug(cpu_map, pd);
+
+	/* Attach the new list of performance domains to the root domain. */
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, pd);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+	return;
+
+free:
+	free_pd(pd);
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, NULL);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+}
+#else
+static void free_pd(struct perf_domain *pd) { }
+#endif /* CONFIG_ENERGY_MODEL */
+
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
@@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu)
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
+	free_pd(rd->pd);
 	kfree(rd);
 }
 
@@ -1959,8 +2070,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	/* Destroy deleted domains: */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_cur[i], doms_new[j])
-			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
+			    dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* No match - a current sched domain not in new doms_new[] */
@@ -1980,8 +2091,8 @@ match1:
 	/* Build new domains: */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_new[i], doms_cur[j])
-			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* No match - add a new doms_new */
@@ -1990,6 +2101,21 @@ match2:
 		;
 	}
 
+#ifdef CONFIG_ENERGY_MODEL
+	/* Build perf. domains: */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < n; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
+				goto match3;
+		}
+		/* No match - add perf. domains for a new rd */
+		build_perf_domains(doms_new[i]);
+match3:
+		;
+	}
+#endif
+
 	/* Remember the new sched domains: */
 	if (doms_cur != &fallback_doms)
 		free_sched_domains(doms_cur, ndoms_cur);
-- 
cgit 


From 011b27bb5d3139e8b5fe9ceff1fc7f6dc3145071 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:19 +0000
Subject: sched/topology: Add lowest CPU asymmetry sched_domain level pointer

Add another member to the family of per-cpu sched_domain shortcut
pointers. This one, sd_asym_cpucapacity, points to the lowest level
at which the SD_ASYM_CPUCAPACITY flag is set. While at it, rename the
sd_asym shortcut to sd_asym_packing to avoid confusions.

Generally speaking, the largest opportunity to save energy via
scheduling comes from a smarter exploitation of heterogeneous platforms
(i.e. big.LITTLE). Consequently, the sd_asym_cpucapacity shortcut will
be used at first as the lowest domain where Energy-Aware Scheduling
(EAS) should be applied. For example, it is possible to apply EAS within
a socket on a multi-socket system, as long as each socket has an
asymmetric topology. Energy-aware cross-sockets wake-up balancing will
only happen when the system is over-utilized, or this_cpu and prev_cpu
are in different sockets.

Suggested-by: Morten Rasmussen <morten.rasmussen@arm.com>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-7-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 2 +-
 kernel/sched/sched.h    | 3 ++-
 kernel/sched/topology.c | 8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdc8356ea742..a31a6d325901 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9299,7 +9299,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		}
 	}
 
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
 	if (sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
 			if (i == cpu ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 808a565187b1..75c403674706 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1303,7 +1303,8 @@ DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
 
 struct sched_group_capacity {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 169d25cafab5..137ccfed9a43 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -508,7 +508,8 @@ DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -534,7 +535,10 @@ static void update_top_cache_domain(int cpu)
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
-	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
-- 
cgit 


From b68a4c0dba3b1e1dda1ede49f3c2fc72d3b54567 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:20 +0000
Subject: sched/topology: Disable EAS on inappropriate platforms

Energy Aware Scheduling (EAS) in its current form is most relevant on
platforms with asymmetric CPU topologies (e.g. Arm big.LITTLE) since
this is where there is a lot of potential for saving energy through
scheduling. This is particularly true since the Energy Model only
includes the active power costs of CPUs, hence not providing enough data
to compare packing-vs-spreading strategies.

As such, disable EAS on root domains where the SD_ASYM_CPUCAPACITY flag
is not set. While at it, disable EAS on systems where the complexity of
the Energy Model is too high since that could lead to unacceptable
scheduling overhead.

All in all, EAS can be used on a root domain if and only if:
  1. an Energy Model is available;
  2. the root domain has an asymmetric CPU capacity topology;
  3. the complexity of the root domain's EM is low enough to keep
     scheduling overheads low.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-8-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 137ccfed9a43..6ddb804b2dec 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -270,12 +270,45 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
 	free_pd(pd);
 }
 
+/*
+ * EAS can be used on a root domain if it meets all the following conditions:
+ *    1. an Energy Model (EM) is available;
+ *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
+ *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *
+ * The complexity of the Energy Model is defined as:
+ *
+ *              C = nr_pd * (nr_cpus + nr_cs)
+ *
+ * with parameters defined as:
+ *  - nr_pd:    the number of performance domains
+ *  - nr_cpus:  the number of CPUs
+ *  - nr_cs:    the sum of the number of capacity states of all performance
+ *              domains (for example, on a system with 2 performance domains,
+ *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *
+ * It is generally not a good idea to use such a model in the wake-up path on
+ * very complex platforms because of the associated scheduling overheads. The
+ * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
+ * with per-CPU DVFS and less than 8 capacity states each, for example.
+ */
+#define EM_MAX_COMPLEXITY 2048
+
 static void build_perf_domains(const struct cpumask *cpu_map)
 {
+	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
-	int i;
+
+	/* EAS is enabled for asymmetric CPU capacity topologies. */
+	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
+					cpumask_pr_args(cpu_map));
+		}
+		goto free;
+	}
 
 	for_each_cpu(i, cpu_map) {
 		/* Skip already covered CPUs. */
@@ -288,6 +321,20 @@ static void build_perf_domains(const struct cpumask *cpu_map)
 			goto free;
 		tmp->next = pd;
 		pd = tmp;
+
+		/*
+		 * Count performance domains and capacity states for the
+		 * complexity check.
+		 */
+		nr_pd++;
+		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+	}
+
+	/* Bail out if the Energy Model complexity is too high. */
+	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
+						cpumask_pr_args(cpu_map));
+		goto free;
 	}
 
 	perf_domain_debug(cpu_map, pd);
-- 
cgit 


From 531b5c9f5cd05ead53324f419b32685a22eebe8b Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:21 +0000
Subject: sched/topology: Make Energy Aware Scheduling depend on schedutil

Energy Aware Scheduling (EAS) is designed with the assumption that
frequencies of CPUs follow their utilization value. When using a CPUFreq
governor other than schedutil, the chances of this assumption being true
are small, if any. When schedutil is being used, EAS' predictions are at
least consistent with the frequency requests. Although those requests
have no guarantees to be honored by the hardware, they should at least
guide DVFS in the right direction and provide some hope in regards to the
EAS model being accurate.

To make sure EAS is only used in a sane configuration, create a strong
dependency on schedutil being used. Since having sugov compiled-in does
not provide that guarantee, make CPUFreq call a scheduler function on
governor changes hence letting it rebuild the scheduling domains, check
the governors of the online CPUs, and enable/disable EAS accordingly.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-9-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/cpufreq/cpufreq.c        |  1 +
 include/linux/cpufreq.h          |  8 ++++++++
 kernel/sched/cpufreq_schedutil.c | 37 +++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h             |  4 +---
 kernel/sched/topology.c          | 28 ++++++++++++++++++++++++----
 5 files changed, 69 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 7aa3dcad2175..6f23ebb395f1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2277,6 +2277,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
 			pr_debug("cpufreq: governor change\n");
+			sched_cpufreq_governor_change(policy, old_gov);
 			return 0;
 		}
 		cpufreq_exit_governor(policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 882a9b9e34bc..c86d6d8bdfed 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -950,6 +950,14 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 }
 #endif
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov);
+#else
+static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov) { }
+#endif
+
 extern void arch_freq_prepare_all(void);
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 90128be27712..c2e53d1a3143 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -624,7 +624,7 @@ static struct kobj_type sugov_tunables_ktype = {
 
 /********************** cpufreq governor interface *********************/
 
-static struct cpufreq_governor schedutil_gov;
+struct cpufreq_governor schedutil_gov;
 
 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 {
@@ -883,7 +883,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
 	sg_policy->need_freq_update = true;
 }
 
-static struct cpufreq_governor schedutil_gov = {
+struct cpufreq_governor schedutil_gov = {
 	.name			= "schedutil",
 	.owner			= THIS_MODULE,
 	.dynamic_switching	= true,
@@ -906,3 +906,36 @@ static int __init sugov_register(void)
 	return cpufreq_register_governor(&schedutil_gov);
 }
 fs_initcall(sugov_register);
+
+#ifdef CONFIG_ENERGY_MODEL
+extern bool sched_energy_update;
+extern struct mutex sched_energy_mutex;
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+	mutex_lock(&sched_energy_mutex);
+	sched_energy_update = true;
+	rebuild_sched_domains();
+	sched_energy_update = false;
+	mutex_unlock(&sched_energy_mutex);
+}
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+				  struct cpufreq_governor *old_gov)
+{
+	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
+		/*
+		 * When called from the cpufreq_register_driver() path, the
+		 * cpu_hotplug_lock is already held, so use a work item to
+		 * avoid nested locking in rebuild_sched_domains().
+		 */
+		schedule_work(&rebuild_sd_work);
+	}
+
+}
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 75c403674706..fd84900b0b21 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2291,10 +2291,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 }
 #endif
 
-#ifdef CONFIG_SMP
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
 #else
 #define perf_domain_span(pd) NULL
 #endif
-#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6ddb804b2dec..0a5a1d3a4eae 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,7 +201,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_MUTEX(sched_energy_mutex);
+bool sched_energy_update;
+
 static void free_pd(struct perf_domain *pd)
 {
 	struct perf_domain *tmp;
@@ -275,6 +278,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
  *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    4. schedutil is driving the frequency of all CPUs of the rd;
  *
  * The complexity of the Energy Model is defined as:
  *
@@ -294,12 +298,15 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
  */
 #define EM_MAX_COMPLEXITY 2048
 
+extern struct cpufreq_governor schedutil_gov;
 static void build_perf_domains(const struct cpumask *cpu_map)
 {
 	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
 
 	/* EAS is enabled for asymmetric CPU capacity topologies. */
 	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
@@ -315,6 +322,19 @@ static void build_perf_domains(const struct cpumask *cpu_map)
 		if (find_pd(pd, i))
 			continue;
 
+		/* Do not attempt EAS if schedutil is not being used. */
+		policy = cpufreq_cpu_get(i);
+		if (!policy)
+			goto free;
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov) {
+			if (rd->pd)
+				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
+						cpumask_pr_args(cpu_map));
+			goto free;
+		}
+
 		/* Create the new pd and add it to the local list. */
 		tmp = pd_init(i);
 		if (!tmp)
@@ -356,7 +376,7 @@ free:
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -2152,10 +2172,10 @@ match2:
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
 				goto match3;
-- 
cgit 


From 1f74de8798c93ce14801cc4e772603e51c841c33 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:22 +0000
Subject: sched/toplogy: Introduce the 'sched_energy_present' static key

In order to make sure Energy Aware Scheduling (EAS) will not impact
systems where no Energy Model is available, introduce a static key
guarding the access to EAS code. Since EAS is enabled on a
per-root-domain basis, the static key is enabled when at least one root
domain meets all conditions for EAS.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-10-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h    |  4 ++++
 kernel/sched/topology.c | 28 ++++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fd84900b0b21..2b3cf356e958 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2296,3 +2296,7 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
 #else
 #define perf_domain_span(pd) NULL
 #endif
+
+#ifdef CONFIG_SMP
+extern struct static_key_false sched_energy_present;
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 0a5a1d3a4eae..3f35ba1d8fde 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,6 +201,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
@@ -273,6 +274,19 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
 	free_pd(pd);
 }
 
+static void sched_energy_set(bool has_eas)
+{
+	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+		if (sched_debug())
+			pr_info("%s: stopping EAS\n", __func__);
+		static_branch_disable_cpuslocked(&sched_energy_present);
+	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+		if (sched_debug())
+			pr_info("%s: starting EAS\n", __func__);
+		static_branch_enable_cpuslocked(&sched_energy_present);
+	}
+}
+
 /*
  * EAS can be used on a root domain if it meets all the following conditions:
  *    1. an Energy Model (EM) is available;
@@ -299,7 +313,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
 #define EM_MAX_COMPLEXITY 2048
 
 extern struct cpufreq_governor schedutil_gov;
-static void build_perf_domains(const struct cpumask *cpu_map)
+static bool build_perf_domains(const struct cpumask *cpu_map)
 {
 	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
@@ -365,7 +379,7 @@ static void build_perf_domains(const struct cpumask *cpu_map)
 	if (tmp)
 		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
 
-	return;
+	return !!pd;
 
 free:
 	free_pd(pd);
@@ -373,6 +387,8 @@ free:
 	rcu_assign_pointer(rd->pd, NULL);
 	if (tmp)
 		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+	return false;
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
@@ -2114,6 +2130,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
+	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
@@ -2177,14 +2194,17 @@ match2:
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
-			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd)
+			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
+				has_eas = true;
 				goto match3;
+			}
 		}
 		/* No match - add perf. domains for a new rd */
-		build_perf_domains(doms_new[i]);
+		has_eas |= build_perf_domains(doms_new[i]);
 match3:
 		;
 	}
+	sched_energy_set(has_eas);
 #endif
 
 	/* Remember the new sched domains: */
-- 
cgit 


From 630246a06ae2a7a12d1fce85f1e5681032982791 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:24 +0000
Subject: sched/fair: Clean-up update_sg_lb_stats parameters

In preparation for the introduction of a new root domain flag which can
be set during load balance (the 'overutilized' flag), clean-up the set
of parameters passed to update_sg_lb_stats(). More specifically, the
'local_group' and 'local_idx' parameters can be removed since they can
easily be reconstructed from within the function.

While at it, transform the 'overload' parameter into a flag stored in
the 'sg_status' parameter hence facilitating the definition of new flags
when needed.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-12-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 27 +++++++++++----------------
 kernel/sched/sched.h |  3 +++
 2 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a31a6d325901..e04f29098ec7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7905,16 +7905,16 @@ static bool update_nohz_stats(struct rq *rq, bool force)
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
- * @overload: Indicate pullable load (e.g. >1 runnable task).
+ * @sg_status: Holds flag indicating the status of the sched_group
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
-			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs,
-			bool *overload)
+				      struct sched_group *group,
+				      struct sg_lb_stats *sgs,
+				      int *sg_status)
 {
+	int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+	int load_idx = get_sd_load_idx(env->sd, env->idle);
 	unsigned long load;
 	int i, nr_running;
 
@@ -7938,7 +7938,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		nr_running = rq->nr_running;
 		if (nr_running > 1)
-			*overload = true;
+			*sg_status |= SG_OVERLOAD;
 
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
@@ -7954,7 +7954,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
 		    sgs->group_misfit_task_load < rq->misfit_task_load) {
 			sgs->group_misfit_task_load = rq->misfit_task_load;
-			*overload = 1;
+			*sg_status |= SG_OVERLOAD;
 		}
 	}
 
@@ -8099,17 +8099,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
-	int load_idx;
-	bool overload = false;
 	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+	int sg_status = 0;
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 		env->flags |= LBF_NOHZ_STATS;
 #endif
 
-	load_idx = get_sd_load_idx(env->sd, env->idle);
-
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
@@ -8124,8 +8121,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-						&overload);
+		update_sg_lb_stats(env, sg, sgs, &sg_status);
 
 		if (local_group)
 			goto next_group;
@@ -8175,8 +8171,7 @@ next_group:
 
 	if (!env->sd->parent) {
 		/* update overload indicator if we are at root domain */
-		if (READ_ONCE(env->dst_rq->rd->overload) != overload)
-			WRITE_ONCE(env->dst_rq->rd->overload, overload);
+		WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
 	}
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b3cf356e958..d4d984846924 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -716,6 +716,9 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 
+/* Scheduling group status flags */
+#define SG_OVERLOAD		0x1 /* More than one runnable task on a CPU. */
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
-- 
cgit 


From 2802bf3cd936fe2c8033a696d375a4d9d3974de4 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Mon, 3 Dec 2018 09:56:25 +0000
Subject: sched/fair: Add over-utilization/tipping point indicator

Energy-aware scheduling is only meant to be active while the system is
_not_ over-utilized. That is, there are spare cycles available to shift
tasks around based on their actual utilization to get a more
energy-efficient task distribution without depriving any tasks. When
above the tipping point task placement is done the traditional way based
on load_avg, spreading the tasks across as many cpus as possible based
on priority scaled load to preserve smp_nice. Below the tipping point we
want to use util_avg instead. We need to define a criteria for when we
make the switch.

The util_avg for each cpu converges towards 100% regardless of how many
additional tasks we may put on it. If we define over-utilized as:

sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity)

some individual cpus may be over-utilized running multiple tasks even
when the above condition is false. That should be okay as long as we try
to spread the tasks out to avoid per-cpu over-utilization as much as
possible and if all tasks have the _same_ priority. If the latter isn't
true, we have to consider priority to preserve smp_nice.

For example, we could have n_cpus nice=-10 util_avg=55% tasks and
n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are
likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks
getting their own as we 1.5*n_cpus tasks in total and 55%+55% is less
over-utilized than 55%+60% for those cpus that have to be shared. The
system utilization is only 85% of the system capacity, but we are
breaking smp_nice.

To be sure not to break smp_nice, we have defined over-utilization
conservatively as when any cpu in the system is fully utilized at its
highest frequency instead:

cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity

IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg
to factor in priority to preserve smp_nice.

With this definition, we can skip periodic load-balance as no cpu has an
always-running task when the system is not over-utilized. All tasks will
be periodic and we can balance them at wake-up. This conservative
condition does however mean that some scenarios that could benefit from
energy-aware decisions even if one cpu is fully utilized would not get
those benefits.

For systems where some cpus might have reduced capacity on some cpus
(RT-pressure and/or big.LITTLE), we want periodic load-balance checks as
soon a just a single cpu is fully utilized as it might one of those with
reduced capacity and in that case we want to migrate it.

[ peterz: Added a comment explaining why new tasks are not accounted during
          overutilization detection. ]

Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-13-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h |  4 ++++
 2 files changed, 61 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e04f29098ec7..767e7675774b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5082,6 +5082,24 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long cpu_util(int cpu);
+static unsigned long capacity_of(int cpu);
+
+static inline bool cpu_overutilized(int cpu)
+{
+	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -5139,8 +5157,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_cfs_group(se);
 	}
 
-	if (!se)
+	if (!se) {
 		add_nr_running(rq, 1);
+		/*
+		 * Since new tasks are assigned an initial util_avg equal to
+		 * half of the spare capacity of their CPU, tiny tasks have the
+		 * ability to cross the overutilized threshold, which will
+		 * result in the load balancer ruining all the task placement
+		 * done by EAS. As a way to mitigate that effect, do not account
+		 * for the first enqueue operation of new tasks during the
+		 * overutilized flag detection.
+		 *
+		 * A better way of solving this problem would be to wait for
+		 * the PELT signals of tasks to converge before taking them
+		 * into account, but that is not straightforward to implement,
+		 * and the following generally works well enough in practice.
+		 */
+		if (flags & ENQUEUE_WAKEUP)
+			update_overutilized_status(rq);
+
+	}
 
 	hrtick_update(rq);
 }
@@ -7940,6 +7976,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		if (nr_running > 1)
 			*sg_status |= SG_OVERLOAD;
 
+		if (cpu_overutilized(i))
+			*sg_status |= SG_OVERUTILIZED;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -8170,8 +8209,15 @@ next_group:
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
 	if (!env->sd->parent) {
+		struct root_domain *rd = env->dst_rq->rd;
+
 		/* update overload indicator if we are at root domain */
-		WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
+		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+
+		/* Update over-utilization (tipping point, U >= 0) indicator */
+		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+	} else if (sg_status & SG_OVERUTILIZED) {
+		WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
 	}
 }
 
@@ -8398,6 +8444,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 * this level.
 	 */
 	update_sd_lb_stats(env, &sds);
+
+	if (static_branch_unlikely(&sched_energy_present)) {
+		struct root_domain *rd = env->dst_rq->rd;
+
+		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+			goto out_balanced;
+	}
+
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 
@@ -9798,6 +9852,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 	update_misfit_status(curr, rq);
+	update_overutilized_status(task_rq(curr));
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d4d984846924..0ba08924e017 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -718,6 +718,7 @@ struct perf_domain {
 
 /* Scheduling group status flags */
 #define SG_OVERLOAD		0x1 /* More than one runnable task on a CPU. */
+#define SG_OVERUTILIZED		0x2 /* One or more CPUs are over-utilized. */
 
 /*
  * We add the notion of a root-domain which will be used to define per-domain
@@ -741,6 +742,9 @@ struct root_domain {
 	 */
 	int			overload;
 
+	/* Indicate one or more cpus over-utilized (tipping point) */
+	int			overutilized;
+
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
-- 
cgit 


From 390031e4c309c94ecc07a558187eb5185200df83 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:26 +0000
Subject: sched/fair: Introduce an energy estimation helper function

In preparation for the definition of an energy-aware wakeup path,
introduce a helper function to estimate the consequence on system energy
when a specific task wakes-up on a specific CPU. compute_energy()
estimates the capacity state to be reached by all performance domains
and estimates the consumption of each online CPU according to its Energy
Model and its percentage of busy time.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-14-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 767e7675774b..b3c94584d947 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6377,6 +6377,82 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	return !task_fits_capacity(p, min_cap);
 }
 
+/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	/*
+	 * If @p migrates from @cpu to another, remove its contribution. Or,
+	 * if @p migrates from another CPU to @cpu, add its contribution. In
+	 * the other cases, @cpu is not impacted by the migration, so the
+	 * util_avg should already be correct.
+	 */
+	if (task_cpu(p) == cpu && dst_cpu != cpu)
+		sub_positive(&util, task_util(p));
+	else if (task_cpu(p) != cpu && dst_cpu == cpu)
+		util += task_util(p);
+
+	if (sched_feat(UTIL_EST)) {
+		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * During wake-up, the task isn't enqueued yet and doesn't
+		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+		 * so just add it (if needed) to "simulate" what will be
+		 * cpu_util() after the task has been enqueued.
+		 */
+		if (dst_cpu == cpu)
+			util_est += _task_util_est(p);
+
+		util = max(util, util_est);
+	}
+
+	return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that would be consumed if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+	long util, max_util, sum_util, energy = 0;
+	int cpu;
+
+	for (; pd; pd = pd->next) {
+		max_util = sum_util = 0;
+		/*
+		 * The capacity state of CPUs of the current rd can be driven by
+		 * CPUs of another rd if they belong to the same performance
+		 * domain. So, account for the utilization of these CPUs too
+		 * by masking pd with cpu_online_mask instead of the rd span.
+		 *
+		 * If an entire performance domain is outside of the current rd,
+		 * it will not appear in its pd list and will not be accounted
+		 * by compute_energy().
+		 */
+		for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+			util = cpu_util_next(cpu, p, dst_cpu);
+			util = schedutil_energy_util(cpu, util);
+			max_util = max(util, max_util);
+			sum_util += util;
+		}
+
+		energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+	}
+
+	return energy;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
-- 
cgit 


From 732cd75b8c920d3727e69957b14faa7c2d7c3b75 Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Mon, 3 Dec 2018 09:56:27 +0000
Subject: sched/fair: Select an energy-efficient CPU on task wake-up

If an Energy Model (EM) is available and if the system isn't
overutilized, re-route waking tasks into an energy-aware placement
algorithm. The selection of an energy-efficient CPU for a task
is achieved by estimating the impact on system-level active energy
resulting from the placement of the task on the CPU with the highest
spare capacity in each performance domain. This strategy spreads tasks
in a performance domain and avoids overly aggressive task packing. The
best CPU energy-wise is then selected if it saves a large enough amount
of energy with respect to prev_cpu.

Although it has already shown significant benefits on some existing
targets, this approach cannot scale to platforms with numerous CPUs.
This is an attempt to do something useful as writing a fast heuristic
that performs reasonably well on a broad spectrum of architectures isn't
an easy task. As such, the scope of usability of the energy-aware
wake-up path is restricted to systems with the SD_ASYM_CPUCAPACITY flag
set, and where the EM isn't too complex.

Signed-off-by: Quentin Perret <quentin.perret@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adharmap@codeaurora.org
Cc: chris.redpath@arm.com
Cc: currojerez@riseup.net
Cc: dietmar.eggemann@arm.com
Cc: edubezval@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: javi.merino@kernel.org
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: pkondeti@codeaurora.org
Cc: rjw@rjwysocki.net
Cc: skannan@codeaurora.org
Cc: smuckle@google.com
Cc: srinivas.pandruvada@linux.intel.com
Cc: thara.gopinath@linaro.org
Cc: tkjos@google.com
Cc: valentin.schneider@arm.com
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/20181203095628.11858-15-quentin.perret@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3c94584d947..ca469646ebe1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6453,6 +6453,137 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 	return energy;
 }
 
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	int cpu, best_energy_cpu = prev_cpu;
+	struct perf_domain *head, *pd;
+	unsigned long cpu_cap, util;
+	struct sched_domain *sd;
+
+	rcu_read_lock();
+	pd = rcu_dereference(rd->pd);
+	if (!pd || READ_ONCE(rd->overutilized))
+		goto fail;
+	head = pd;
+
+	/*
+	 * Energy-aware wake-up happens on the lowest sched_domain starting
+	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+	 */
+	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+		sd = sd->parent;
+	if (!sd)
+		goto fail;
+
+	sync_entity_load_avg(&p->se);
+	if (!task_util_est(p))
+		goto unlock;
+
+	for (; pd; pd = pd->next) {
+		unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+		int max_spare_cap_cpu = -1;
+
+		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+			if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+				continue;
+
+			/* Skip CPUs that will be overutilized. */
+			util = cpu_util_next(cpu, p, cpu);
+			cpu_cap = capacity_of(cpu);
+			if (cpu_cap * 1024 < util * capacity_margin)
+				continue;
+
+			/* Always use prev_cpu as a candidate. */
+			if (cpu == prev_cpu) {
+				prev_energy = compute_energy(p, prev_cpu, head);
+				best_energy = min(best_energy, prev_energy);
+				continue;
+			}
+
+			/*
+			 * Find the CPU with the maximum spare capacity in
+			 * the performance domain
+			 */
+			spare_cap = cpu_cap - util;
+			if (spare_cap > max_spare_cap) {
+				max_spare_cap = spare_cap;
+				max_spare_cap_cpu = cpu;
+			}
+		}
+
+		/* Evaluate the energy impact of using this CPU. */
+		if (max_spare_cap_cpu >= 0) {
+			cur_energy = compute_energy(p, max_spare_cap_cpu, head);
+			if (cur_energy < best_energy) {
+				best_energy = cur_energy;
+				best_energy_cpu = max_spare_cap_cpu;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	/*
+	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+	 * least 6% of the energy used by prev_cpu.
+	 */
+	if (prev_energy == ULONG_MAX)
+		return best_energy_cpu;
+
+	if ((prev_energy - best_energy) > (prev_energy >> 4))
+		return best_energy_cpu;
+
+	return prev_cpu;
+
+fail:
+	rcu_read_unlock();
+
+	return -1;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6476,8 +6607,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
-		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
+
+		if (static_branch_unlikely(&sched_energy_present)) {
+			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+			if (new_cpu >= 0)
+				return new_cpu;
+			new_cpu = prev_cpu;
+		}
+
+		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+			      cpumask_test_cpu(cpu, &p->cpus_allowed);
 	}
 
 	rcu_read_lock();
-- 
cgit 


From db5113911abaa7eb20cf115d4339959c1aecea95 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Sun, 9 Dec 2018 11:24:11 -0700
Subject: seccomp: hoist struct seccomp_data recalculation higher

In the next patch, we're going to use the sd pointer passed to
__seccomp_filter() as the data to pass to userspace. Except that in some
cases (__seccomp_filter(SECCOMP_RET_TRACE), emulate_vsyscall(), every time
seccomp is inovked on power, etc.) the sd pointer will be NULL in order to
force seccomp to recompute the register data. Previously this recomputation
happened one level lower, in seccomp_run_filters(); this patch just moves
it up a level higher to __seccomp_filter().

Thanks Oleg for spotting this.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
CC: Christian Brauner <christian@brauner.io>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f2ae2324c232..96afc32e041d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -188,7 +188,6 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 static u32 seccomp_run_filters(const struct seccomp_data *sd,
 			       struct seccomp_filter **match)
 {
-	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	struct seccomp_filter *f =
@@ -198,11 +197,6 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	if (WARN_ON(f == NULL))
 		return SECCOMP_RET_KILL_PROCESS;
 
-	if (!sd) {
-		populate_seccomp_data(&sd_local);
-		sd = &sd_local;
-	}
-
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
@@ -658,6 +652,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 	u32 filter_ret, action;
 	struct seccomp_filter *match = NULL;
 	int data;
+	struct seccomp_data sd_local;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
@@ -665,6 +660,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 	 */
 	rmb();
 
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
+
 	filter_ret = seccomp_run_filters(sd, &match);
 	data = filter_ret & SECCOMP_RET_DATA;
 	action = filter_ret & SECCOMP_RET_ACTION_FULL;
-- 
cgit 


From a5662e4d81c4d4b08140c625d0f3c50b15786252 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Sun, 9 Dec 2018 11:24:12 -0700
Subject: seccomp: switch system call argument type to void *

The const qualifier causes problems for any code that wants to write to the
third argument of the seccomp syscall, as we will do in a future patch in
this series.

The third argument to the seccomp syscall is documented as void *, so
rather than just dropping the const, let's switch everything to use void *
as well.

I believe this is safe because of 1. the documentation above, 2. there's no
real type information exported about syscalls anywhere besides the man
pages.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
CC: Christian Brauner <christian@brauner.io>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h  | 2 +-
 include/linux/syscalls.h | 2 +-
 kernel/seccomp.c         | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index e5320f6c8654..b5103c019cf4 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -43,7 +43,7 @@ extern void secure_computing_strict(int this_syscall);
 #endif
 
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long, char __user *);
+extern long prctl_set_seccomp(unsigned long, void __user *);
 
 static inline int seccomp_mode(struct seccomp *s)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ac3d13a915b..a60694fb0f58 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -879,7 +879,7 @@ asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
 			      int newdfd, const char __user *newname,
 			      unsigned int flags);
 asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
-			    const char __user *uargs);
+			    void __user *uargs);
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 96afc32e041d..393e029f778a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -924,7 +924,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
-		       const char __user *uargs)
+		       void __user *uargs)
 {
 	switch (op) {
 	case SECCOMP_SET_MODE_STRICT:
@@ -944,7 +944,7 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 }
 
 SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
-			 const char __user *, uargs)
+			 void __user *, uargs)
 {
 	return do_seccomp(op, flags, uargs);
 }
@@ -956,10 +956,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
 {
 	unsigned int op;
-	char __user *uargs;
+	void __user *uargs;
 
 	switch (seccomp_mode) {
 	case SECCOMP_MODE_STRICT:
-- 
cgit 


From 6a21cc50f0c7f87dae5259f6cfefe024412313f6 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Sun, 9 Dec 2018 11:24:13 -0700
Subject: seccomp: add a return code to trap to userspace

This patch introduces a means for syscalls matched in seccomp to notify
some other task that a particular filter has been triggered.

The motivation for this is primarily for use with containers. For example,
if a container does an init_module(), we obviously don't want to load this
untrusted code, which may be compiled for the wrong version of the kernel
anyway. Instead, we could parse the module image, figure out which module
the container is trying to load and load it on the host.

As another example, containers cannot mount() in general since various
filesystems assume a trusted image. However, if an orchestrator knows that
e.g. a particular block device has not been exposed to a container for
writing, it want to allow the container to mount that block device (that
is, handle the mount for it).

This patch adds functionality that is already possible via at least two
other means that I know about, both of which involve ptrace(): first, one
could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL.
Unfortunately this is slow, so a faster version would be to install a
filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP.
Since ptrace allows only one tracer, if the container runtime is that
tracer, users inside the container (or outside) trying to debug it will not
be able to use ptrace, which is annoying. It also means that older
distributions based on Upstart cannot boot inside containers using ptrace,
since upstart itself uses ptrace to monitor services while starting.

The actual implementation of this is fairly small, although getting the
synchronization right was/is slightly complex.

Finally, it's worth noting that the classic seccomp TOCTOU of reading
memory data from the task still applies here, but can be avoided with
careful design of the userspace handler: if the userspace handler reads all
of the task memory that is necessary before applying its security policy,
the tracee's subsequent memory edits will not be read by the tracer.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
CC: Christian Brauner <christian@brauner.io>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/ioctl/ioctl-number.txt           |   1 +
 Documentation/userspace-api/seccomp_filter.rst |  84 +++++
 include/linux/seccomp.h                        |   7 +-
 include/uapi/linux/seccomp.h                   |  40 ++-
 kernel/seccomp.c                               | 448 ++++++++++++++++++++++++-
 tools/testing/selftests/seccomp/seccomp_bpf.c  | 447 +++++++++++++++++++++++-
 6 files changed, 1017 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index af6f6ba1fe80..c9558146ac58 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -79,6 +79,7 @@ Code  Seq#(hex)	Include File		Comments
 0x1b	all	InfiniBand Subsystem	<http://infiniband.sourceforge.net/>
 0x20	all	drivers/cdrom/cm206.h
 0x22	all	scsi/sg.h
+'!'	00-1F	uapi/linux/seccomp.h
 '#'	00-3F	IEEE 1394 Subsystem	Block for the entire subsystem
 '$'	00-0F	linux/perf_counter.h, linux/perf_event.h
 '%'	00-0F	include/uapi/linux/stm.h
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 82a468bc7560..b1b846d8a094 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -122,6 +122,11 @@ In precedence order, they are:
 	Results in the lower 16-bits of the return value being passed
 	to userland as the errno without executing the system call.
 
+``SECCOMP_RET_USER_NOTIF``:
+    Results in a ``struct seccomp_notif`` message sent on the userspace
+    notification fd, if it is attached, or ``-ENOSYS`` if it is not. See below
+    on discussion of how to handle user notifications.
+
 ``SECCOMP_RET_TRACE``:
 	When returned, this value will cause the kernel to attempt to
 	notify a ``ptrace()``-based tracer prior to executing the system
@@ -183,6 +188,85 @@ The ``samples/seccomp/`` directory contains both an x86-specific example
 and a more generic example of a higher level macro interface for BPF
 program generation.
 
+Userspace Notification
+======================
+
+The ``SECCOMP_RET_USER_NOTIF`` return code lets seccomp filters pass a
+particular syscall to userspace to be handled. This may be useful for
+applications like container managers, which wish to intercept particular
+syscalls (``mount()``, ``finit_module()``, etc.) and change their behavior.
+
+To acquire a notification FD, use the ``SECCOMP_FILTER_FLAG_NEW_LISTENER``
+argument to the ``seccomp()`` syscall:
+
+.. code-block:: c
+
+    fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
+
+which (on success) will return a listener fd for the filter, which can then be
+passed around via ``SCM_RIGHTS`` or similar. Note that filter fds correspond to
+a particular filter, and not a particular task. So if this task then forks,
+notifications from both tasks will appear on the same filter fd. Reads and
+writes to/from a filter fd are also synchronized, so a filter fd can safely
+have many readers.
+
+The interface for a seccomp notification fd consists of two structures:
+
+.. code-block:: c
+
+    struct seccomp_notif_sizes {
+        __u16 seccomp_notif;
+        __u16 seccomp_notif_resp;
+        __u16 seccomp_data;
+    };
+
+    struct seccomp_notif {
+        __u64 id;
+        __u32 pid;
+        __u32 flags;
+        struct seccomp_data data;
+    };
+
+    struct seccomp_notif_resp {
+        __u64 id;
+        __s64 val;
+        __s32 error;
+        __u32 flags;
+    };
+
+The ``struct seccomp_notif_sizes`` structure can be used to determine the size
+of the various structures used in seccomp notifications. The size of ``struct
+seccomp_data`` may change in the future, so code should use:
+
+.. code-block:: c
+
+    struct seccomp_notif_sizes sizes;
+    seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes);
+
+to determine the size of the various structures to allocate. See
+samples/seccomp/user-trap.c for an example.
+
+Users can read via ``ioctl(SECCOMP_IOCTL_NOTIF_RECV)``  (or ``poll()``) on a
+seccomp notification fd to receive a ``struct seccomp_notif``, which contains
+five members: the input length of the structure, a unique-per-filter ``id``,
+the ``pid`` of the task which triggered this request (which may be 0 if the
+task is in a pid ns not visible from the listener's pid namespace), a ``flags``
+member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing
+whether or not the notification is a result of a non-fatal signal, and the
+``data`` passed to seccomp. Userspace can then make a decision based on this
+information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a
+response, indicating what should be returned to userspace. The ``id`` member of
+``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct
+seccomp_notif``.
+
+It is worth noting that ``struct seccomp_data`` contains the values of register
+arguments to the syscall, but does not contain pointers to memory. The task's
+memory is accessible to suitably privileged traces via ``ptrace()`` or
+``/proc/pid/mem``. However, care should be taken to avoid the TOCTOU mentioned
+above in this document: all arguments being read from the tracee's memory
+should be read into the tracer's memory before any policy decisions are made.
+This allows for an atomic decision on syscall arguments.
+
 Sysctls
 =======
 
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index b5103c019cf4..84868d37b35d 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -4,9 +4,10 @@
 
 #include <uapi/linux/seccomp.h>
 
-#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC	| \
-					 SECCOMP_FILTER_FLAG_LOG	| \
-					 SECCOMP_FILTER_FLAG_SPEC_ALLOW)
+#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
+					 SECCOMP_FILTER_FLAG_LOG | \
+					 SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
+					 SECCOMP_FILTER_FLAG_NEW_LISTENER)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 9efc0e73d50b..90734aa5aa36 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -15,11 +15,13 @@
 #define SECCOMP_SET_MODE_STRICT		0
 #define SECCOMP_SET_MODE_FILTER		1
 #define SECCOMP_GET_ACTION_AVAIL	2
+#define SECCOMP_GET_NOTIF_SIZES		3
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC	(1UL << 0)
-#define SECCOMP_FILTER_FLAG_LOG		(1UL << 1)
-#define SECCOMP_FILTER_FLAG_SPEC_ALLOW	(1UL << 2)
+#define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
+#define SECCOMP_FILTER_FLAG_LOG			(1UL << 1)
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
 
 /*
  * All BPF programs must return a 32-bit value.
@@ -35,6 +37,7 @@
 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
+#define SECCOMP_RET_USER_NOTIF	 0x7fc00000U /* notifies userspace */
 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
@@ -60,4 +63,35 @@ struct seccomp_data {
 	__u64 args[6];
 };
 
+struct seccomp_notif_sizes {
+	__u16 seccomp_notif;
+	__u16 seccomp_notif_resp;
+	__u16 seccomp_data;
+};
+
+struct seccomp_notif {
+	__u64 id;
+	__u32 pid;
+	__u32 flags;
+	struct seccomp_data data;
+};
+
+struct seccomp_notif_resp {
+	__u64 id;
+	__s64 val;
+	__s32 error;
+	__u32 flags;
+};
+
+#define SECCOMP_IOC_MAGIC		'!'
+#define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
+#define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
+
+/* Flags for seccomp notification fd ioctl. */
+#define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
+#define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
+						struct seccomp_notif_resp)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 393e029f778a..15b6be97fc09 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -33,12 +33,74 @@
 #endif
 
 #ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+
+enum notify_state {
+	SECCOMP_NOTIFY_INIT,
+	SECCOMP_NOTIFY_SENT,
+	SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+	/* The struct pid of the task whose filter triggered the notification */
+	struct task_struct *task;
+
+	/* The "cookie" for this request; this is unique for this filter. */
+	u64 id;
+
+	/*
+	 * The seccomp data. This pointer is valid the entire time this
+	 * notification is active, since it comes from __seccomp_filter which
+	 * eclipses the entire lifecycle here.
+	 */
+	const struct seccomp_data *data;
+
+	/*
+	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+	 * struct seccomp_knotif is created and starts out in INIT. Once the
+	 * handler reads the notification off of an FD, it transitions to SENT.
+	 * If a signal is received the state transitions back to INIT and
+	 * another message is sent. When the userspace handler replies, state
+	 * transitions to REPLIED.
+	 */
+	enum notify_state state;
+
+	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+	int error;
+	long val;
+
+	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+	struct completion ready;
+
+	struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @request: A semaphore that users of this notification can wait on for
+ *           changes. Actual reads and writes are still controlled with
+ *           filter->notify_lock.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ * @wqh: A wait queue for poll.
+ */
+struct notification {
+	struct semaphore request;
+	u64 next_id;
+	struct list_head notifications;
+	wait_queue_head_t wqh;
+};
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -50,6 +112,8 @@
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
  *
  * seccomp_filter objects are organized in a tree linked via the @prev
  * pointer.  For any task, it appears to be a singly-linked list starting
@@ -66,6 +130,8 @@ struct seccomp_filter {
 	bool log;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
+	struct notification *notif;
+	struct mutex notify_lock;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -386,6 +452,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 	if (!sfilter)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&sfilter->notify_lock);
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
 					seccomp_check_filter, save_orig);
 	if (ret < 0) {
@@ -479,7 +546,6 @@ static long seccomp_attach_filter(unsigned int flags,
 
 static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
-	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);
 }
 
@@ -550,11 +616,13 @@ static void seccomp_send_sigsys(int syscall, int reason)
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
+#define SECCOMP_LOG_USER_NOTIF		(1 << 7)
 
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
 				    SECCOMP_LOG_KILL_THREAD  |
 				    SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO |
+				    SECCOMP_LOG_USER_NOTIF |
 				    SECCOMP_LOG_TRACE |
 				    SECCOMP_LOG_LOG;
 
@@ -575,6 +643,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 	case SECCOMP_RET_TRACE:
 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
 		break;
+	case SECCOMP_RET_USER_NOTIF:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
+		break;
 	case SECCOMP_RET_LOG:
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
 		break;
@@ -646,6 +717,68 @@ void secure_computing_strict(int this_syscall)
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
+static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
+{
+	/*
+	 * Note: overflow is ok here, the id just needs to be unique per
+	 * filter.
+	 */
+	lockdep_assert_held(&filter->notify_lock);
+	return filter->notif->next_id++;
+}
+
+static void seccomp_do_user_notification(int this_syscall,
+					 struct seccomp_filter *match,
+					 const struct seccomp_data *sd)
+{
+	int err;
+	long ret = 0;
+	struct seccomp_knotif n = {};
+
+	mutex_lock(&match->notify_lock);
+	err = -ENOSYS;
+	if (!match->notif)
+		goto out;
+
+	n.task = current;
+	n.state = SECCOMP_NOTIFY_INIT;
+	n.data = sd;
+	n.id = seccomp_next_notify_id(match);
+	init_completion(&n.ready);
+	list_add(&n.list, &match->notif->notifications);
+
+	up(&match->notif->request);
+	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+	mutex_unlock(&match->notify_lock);
+
+	/*
+	 * This is where we wait for a reply from userspace.
+	 */
+	err = wait_for_completion_interruptible(&n.ready);
+	mutex_lock(&match->notify_lock);
+	if (err == 0) {
+		ret = n.val;
+		err = n.error;
+	}
+
+	/*
+	 * Note that it's possible the listener died in between the time when
+	 * we were notified of a respons (or a signal) and when we were able to
+	 * re-acquire the lock, so only delete from the list if the
+	 * notification actually exists.
+	 *
+	 * Also note that this test is only valid because there's no way to
+	 * *reattach* to a notifier right now. If one is added, we'll need to
+	 * keep track of the notif itself and make sure they match here.
+	 */
+	if (match->notif)
+		list_del(&n.list);
+out:
+	mutex_unlock(&match->notify_lock);
+	syscall_set_return_value(current, task_pt_regs(current),
+				 err, ret);
+}
+
 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 			    const bool recheck_after_trace)
 {
@@ -728,6 +861,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 
 		return 0;
 
+	case SECCOMP_RET_USER_NOTIF:
+		seccomp_do_user_notification(this_syscall, match, sd);
+		goto skip;
+
 	case SECCOMP_RET_LOG:
 		seccomp_log(this_syscall, 0, action, true);
 		return 0;
@@ -834,6 +971,263 @@ out:
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+	struct seccomp_filter *filter = file->private_data;
+	struct seccomp_knotif *knotif;
+
+	mutex_lock(&filter->notify_lock);
+
+	/*
+	 * If this file is being closed because e.g. the task who owned it
+	 * died, let's wake everyone up who was waiting on us.
+	 */
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
+			continue;
+
+		knotif->state = SECCOMP_NOTIFY_REPLIED;
+		knotif->error = -ENOSYS;
+		knotif->val = 0;
+
+		complete(&knotif->ready);
+	}
+
+	kfree(filter->notif);
+	filter->notif = NULL;
+	mutex_unlock(&filter->notify_lock);
+	__put_seccomp_filter(filter);
+	return 0;
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL, *cur;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	ret = down_interruptible(&filter->notif->request);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't find a notification, it could be that the task was
+	 * interrupted by a fatal signal between the time we were woken and
+	 * when we were able to acquire the rw lock.
+	 */
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	unotif.id = knotif->id;
+	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.data = *(knotif->data);
+
+	knotif->state = SECCOMP_NOTIFY_SENT;
+	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+	ret = 0;
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
+		ret = -EFAULT;
+
+		/*
+		 * Userspace screwed up. To make sure that we keep this
+		 * notification alive, let's reset it back to INIT. It
+		 * may have died when we released the lock, so we need to make
+		 * sure it's still around.
+		 */
+		knotif = NULL;
+		mutex_lock(&filter->notify_lock);
+		list_for_each_entry(cur, &filter->notif->notifications, list) {
+			if (cur->id == unotif.id) {
+				knotif = cur;
+				break;
+			}
+		}
+
+		if (knotif) {
+			knotif->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+		}
+		mutex_unlock(&filter->notify_lock);
+	}
+
+	return ret;
+}
+
+static long seccomp_notify_send(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_knotif *knotif = NULL, *cur;
+	long ret;
+
+	if (copy_from_user(&resp, buf, sizeof(resp)))
+		return -EFAULT;
+
+	if (resp.flags)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == resp.id) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* Allow exactly one reply. */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+
+	ret = 0;
+	knotif->state = SECCOMP_NOTIFY_REPLIED;
+	knotif->error = resp.error;
+	knotif->val = resp.val;
+	complete(&knotif->ready);
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_id_valid(struct seccomp_filter *filter,
+				    void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL;
+	u64 id;
+	long ret;
+
+	if (copy_from_user(&id, buf, sizeof(id)))
+		return -EFAULT;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	ret = -ENOENT;
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->id == id) {
+			if (knotif->state == SECCOMP_NOTIFY_SENT)
+				ret = 0;
+			goto out;
+		}
+	}
+
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct seccomp_filter *filter = file->private_data;
+	void __user *buf = (void __user *)arg;
+
+	switch (cmd) {
+	case SECCOMP_IOCTL_NOTIF_RECV:
+		return seccomp_notify_recv(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SEND:
+		return seccomp_notify_send(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ID_VALID:
+		return seccomp_notify_id_valid(filter, buf);
+	default:
+		return -EINVAL;
+	}
+}
+
+static __poll_t seccomp_notify_poll(struct file *file,
+				    struct poll_table_struct *poll_tab)
+{
+	struct seccomp_filter *filter = file->private_data;
+	__poll_t ret = 0;
+	struct seccomp_knotif *cur;
+
+	poll_wait(file, &filter->notif->wqh, poll_tab);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return EPOLLERR;
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT)
+			ret |= EPOLLIN | EPOLLRDNORM;
+		if (cur->state == SECCOMP_NOTIFY_SENT)
+			ret |= EPOLLOUT | EPOLLWRNORM;
+		if ((ret & EPOLLIN) && (ret & EPOLLOUT))
+			break;
+	}
+
+	mutex_unlock(&filter->notify_lock);
+
+	return ret;
+}
+
+static const struct file_operations seccomp_notify_ops = {
+	.poll = seccomp_notify_poll,
+	.release = seccomp_notify_release,
+	.unlocked_ioctl = seccomp_notify_ioctl,
+};
+
+static struct file *init_listener(struct seccomp_filter *filter)
+{
+	struct file *ret = ERR_PTR(-EBUSY);
+	struct seccomp_filter *cur;
+
+	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
+		if (cur->notif)
+			goto out;
+	}
+
+	ret = ERR_PTR(-ENOMEM);
+	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
+	if (!filter->notif)
+		goto out;
+
+	sema_init(&filter->notif->request, 0);
+	filter->notif->next_id = get_random_u64();
+	INIT_LIST_HEAD(&filter->notif->notifications);
+	init_waitqueue_head(&filter->notif->wqh);
+
+	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
+				 filter, O_RDWR);
+	if (IS_ERR(ret))
+		goto out_notif;
+
+	/* The file has a reference to it now */
+	__get_seccomp_filter(filter);
+
+out_notif:
+	if (IS_ERR(ret))
+		kfree(filter->notif);
+out:
+	return ret;
+}
+
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
  * @flags:  flags to change filter behavior
@@ -853,6 +1247,8 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
+	int listener = -1;
+	struct file *listener_f = NULL;
 
 	/* Validate flags. */
 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
@@ -863,13 +1259,28 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		listener = get_unused_fd_flags(O_CLOEXEC);
+		if (listener < 0) {
+			ret = listener;
+			goto out_free;
+		}
+
+		listener_f = init_listener(prepared);
+		if (IS_ERR(listener_f)) {
+			put_unused_fd(listener);
+			ret = PTR_ERR(listener_f);
+			goto out_free;
+		}
+	}
+
 	/*
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
 	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_free;
+		goto out_put_fd;
 
 	spin_lock_irq(&current->sighand->siglock);
 
@@ -887,6 +1298,16 @@ out:
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
 		mutex_unlock(&current->signal->cred_guard_mutex);
+out_put_fd:
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		if (ret < 0) {
+			fput(listener_f);
+			put_unused_fd(listener);
+		} else {
+			fd_install(listener, listener_f);
+			ret = listener;
+		}
+	}
 out_free:
 	seccomp_filter_free(prepared);
 	return ret;
@@ -911,6 +1332,7 @@ static long seccomp_get_action_avail(const char __user *uaction)
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
+	case SECCOMP_RET_USER_NOTIF:
 	case SECCOMP_RET_TRACE:
 	case SECCOMP_RET_LOG:
 	case SECCOMP_RET_ALLOW:
@@ -922,6 +1344,20 @@ static long seccomp_get_action_avail(const char __user *uaction)
 	return 0;
 }
 
+static long seccomp_get_notif_sizes(void __user *usizes)
+{
+	struct seccomp_notif_sizes sizes = {
+		.seccomp_notif = sizeof(struct seccomp_notif),
+		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
+		.seccomp_data = sizeof(struct seccomp_data),
+	};
+
+	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
+		return -EFAULT;
+
+	return 0;
+}
+
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
 		       void __user *uargs)
@@ -938,6 +1374,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
 			return -EINVAL;
 
 		return seccomp_get_action_avail(uargs);
+	case SECCOMP_GET_NOTIF_SIZES:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_get_notif_sizes(uargs);
 	default:
 		return -EINVAL;
 	}
@@ -1111,6 +1552,7 @@ long seccomp_get_metadata(struct task_struct *task,
 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
+#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
 #define SECCOMP_RET_TRACE_NAME		"trace"
 #define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
@@ -1120,6 +1562,7 @@ static const char seccomp_actions_avail[] =
 				SECCOMP_RET_KILL_THREAD_NAME	" "
 				SECCOMP_RET_TRAP_NAME		" "
 				SECCOMP_RET_ERRNO_NAME		" "
+				SECCOMP_RET_USER_NOTIF_NAME     " "
 				SECCOMP_RET_TRACE_NAME		" "
 				SECCOMP_RET_LOG_NAME		" "
 				SECCOMP_RET_ALLOW_NAME;
@@ -1134,6 +1577,7 @@ static const struct seccomp_log_name seccomp_log_names[] = {
 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index e1473234968d..5c9768a1b8cd 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -5,6 +5,7 @@
  * Test code for seccomp bpf.
  */
 
+#define _GNU_SOURCE
 #include <sys/types.h>
 
 /*
@@ -40,10 +41,12 @@
 #include <sys/fcntl.h>
 #include <sys/mman.h>
 #include <sys/times.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
 
-#define _GNU_SOURCE
 #include <unistd.h>
 #include <sys/syscall.h>
+#include <poll.h>
 
 #include "../kselftest_harness.h"
 
@@ -133,6 +136,10 @@ struct seccomp_data {
 #define SECCOMP_GET_ACTION_AVAIL 2
 #endif
 
+#ifndef SECCOMP_GET_NOTIF_SIZES
+#define SECCOMP_GET_NOTIF_SIZES 3
+#endif
+
 #ifndef SECCOMP_FILTER_FLAG_TSYNC
 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 #endif
@@ -154,6 +161,44 @@ struct seccomp_metadata {
 };
 #endif
 
+#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
+
+#define SECCOMP_RET_USER_NOTIF 0x7fc00000U
+
+#define SECCOMP_IOC_MAGIC		'!'
+#define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
+#define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
+
+/* Flags for seccomp notification fd ioctl. */
+#define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
+#define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
+						struct seccomp_notif_resp)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+
+struct seccomp_notif {
+	__u64 id;
+	__u32 pid;
+	__u32 flags;
+	struct seccomp_data data;
+};
+
+struct seccomp_notif_resp {
+	__u64 id;
+	__s64 val;
+	__s32 error;
+	__u32 flags;
+};
+
+struct seccomp_notif_sizes {
+	__u16 seccomp_notif;
+	__u16 seccomp_notif_resp;
+	__u16 seccomp_data;
+};
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
@@ -2077,7 +2122,8 @@ TEST(detect_seccomp_filter_flags)
 {
 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
 				 SECCOMP_FILTER_FLAG_LOG,
-				 SECCOMP_FILTER_FLAG_SPEC_ALLOW };
+				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
+				 SECCOMP_FILTER_FLAG_NEW_LISTENER };
 	unsigned int flag, all_flags;
 	int i;
 	long ret;
@@ -2933,6 +2979,403 @@ skip:
 	ASSERT_EQ(0, kill(pid, SIGKILL));
 }
 
+static int user_trap_syscall(int nr, unsigned int flags)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+#define USER_NOTIF_MAGIC 116983961184613L
+TEST(user_notification_basic)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	struct pollfd pollfd;
+
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	/* Check that we get -ENOSYS with no listener attached */
+	if (pid == 0) {
+		if (user_trap_syscall(__NR_getpid, 0) < 0)
+			exit(1);
+		ret = syscall(__NR_getpid);
+		exit(ret >= 0 || errno != ENOSYS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	/* Add some no-op filters so for grins. */
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+
+	/* Check that the basic notification machinery works */
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/* Installing a second listener in the chain should EBUSY */
+	EXPECT_EQ(user_trap_syscall(__NR_getpid,
+				    SECCOMP_FILTER_FLAG_NEW_LISTENER),
+		  -1);
+	EXPECT_EQ(errno, EBUSY);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = syscall(__NR_getpid);
+		exit(ret != USER_NOTIF_MAGIC);
+	}
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLIN);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	pollfd.fd = listener;
+	pollfd.events = POLLIN | POLLOUT;
+
+	EXPECT_GT(poll(&pollfd, 1, -1), 0);
+	EXPECT_EQ(pollfd.revents, POLLOUT);
+
+	EXPECT_EQ(req.data.nr,  __NR_getpid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	/* check that we make sure flags == 0 */
+	resp.flags = 1;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	resp.flags = 0;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_kill_in_middle)
+{
+	pid_t pid;
+	long ret;
+	int listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/*
+	 * Check that nothing bad happens when we kill the task in the middle
+	 * of a syscall.
+	 */
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = syscall(__NR_getpid);
+		exit(ret != USER_NOTIF_MAGIC);
+	}
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
+
+	EXPECT_EQ(kill(pid, SIGKILL), 0);
+	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
+
+	resp.id = req.id;
+	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, ENOENT);
+}
+
+static int handled = -1;
+
+static void signal_handler(int signal)
+{
+	if (write(handled, "c", 1) != 1)
+		perror("write from signal");
+}
+
+TEST(user_notification_signal)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, sk_pair[2];
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	char c;
+
+	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+	listener = user_trap_syscall(__NR_gettid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(sk_pair[0]);
+		handled = sk_pair[1];
+		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
+			perror("signal");
+			exit(1);
+		}
+		/*
+		 * ERESTARTSYS behavior is a bit hard to test, because we need
+		 * to rely on a signal that has not yet been handled. Let's at
+		 * least check that the error code gets propagated through, and
+		 * hope that it doesn't break when there is actually a signal :)
+		 */
+		ret = syscall(__NR_gettid);
+		exit(!(ret == -1 && errno == 512));
+	}
+
+	close(sk_pair[1]);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	EXPECT_EQ(kill(pid, SIGUSR1), 0);
+
+	/*
+	 * Make sure the signal really is delivered, which means we're not
+	 * stuck in the user notification code any more and the notification
+	 * should be dead.
+	 */
+	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
+
+	resp.id = req.id;
+	resp.error = -EPERM;
+	resp.val = 0;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+	EXPECT_EQ(errno, ENOENT);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	resp.id = req.id;
+	resp.error = -512; /* -ERESTARTSYS */
+	resp.val = 0;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_closed_listener)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	/*
+	 * Check that we get an ENOSYS when the listener is closed.
+	 */
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0) {
+		close(listener);
+		ret = syscall(__NR_getpid);
+		exit(ret != -1 && errno != ENOSYS);
+	}
+
+	close(listener);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+/*
+ * Check that a pid in a child namespace still shows up as valid in ours.
+ */
+TEST(user_notification_child_pid_ns)
+{
+	pid_t pid;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0)
+		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(req.pid, pid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+	close(listener);
+}
+
+/*
+ * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
+ * invalid.
+ */
+TEST(user_notification_sibling_pid_ns)
+{
+	pid_t pid, pid2;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+
+		pid2 = fork();
+		ASSERT_GE(pid2, 0);
+
+		if (pid2 == 0)
+			exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+		EXPECT_EQ(true, WIFEXITED(status));
+		EXPECT_EQ(0, WEXITSTATUS(status));
+		exit(WEXITSTATUS(status));
+	}
+
+	/* Create the sibling ns, and sibling in it. */
+	EXPECT_EQ(unshare(CLONE_NEWPID), 0);
+	EXPECT_EQ(errno, 0);
+
+	pid2 = fork();
+	EXPECT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+		/*
+		 * The pid should be 0, i.e. the task is in some namespace that
+		 * we can't "see".
+		 */
+		ASSERT_EQ(req.pid, 0);
+
+		resp.id = req.id;
+		resp.error = 0;
+		resp.val = USER_NOTIF_MAGIC;
+
+		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+		exit(0);
+	}
+
+	close(listener);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_fault_recv)
+{
+	pid_t pid;
+	int status, listener;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0)
+		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+
+	/* Do a bad recv() */
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
+	EXPECT_EQ(errno, EFAULT);
+
+	/* We should still be able to receive this notification, though. */
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	EXPECT_EQ(req.pid, pid);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(seccomp_get_notif_sizes)
+{
+	struct seccomp_notif_sizes sizes;
+
+	EXPECT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
+	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
+	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
+}
+
 /*
  * TODO:
  * - add microbenchmarks
-- 
cgit 


From 5b20c6fd6a60e182243da31c47f2ebff5b0e3d57 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Tue, 11 Dec 2018 11:37:44 -0500
Subject: timekeeping: Convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john.stultz@linaro.org
Cc: sboyd@kernel.org
Link: https://lkml.kernel.org/r/20181211163744.22133-1-tiny.windzz@gmail.com
---
 kernel/time/timekeeping_debug.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f811882cfd13..86489950d690 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,7 +19,7 @@
 
 static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
-static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
+static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
 {
 	unsigned int bin;
 	seq_puts(s, "      time (secs)        count\n");
@@ -33,18 +33,7 @@ static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 	}
 	return 0;
 }
-
-static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, tk_debug_show_sleep_time, NULL);
-}
-
-static const struct file_operations tk_debug_sleep_time_fops = {
-	.open		= tk_debug_sleep_time_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);
 
 static int __init tk_debug_sleep_time_init(void)
 {
-- 
cgit 


From fdadd04931c2d7cd294dc5b2b342863f94be53a3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 11 Dec 2018 12:14:12 +0100
Subject: bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K

Michael and Sandipan report:

  Commit ede95a63b5 introduced a bpf_jit_limit tuneable to limit BPF
  JIT allocations. At compile time it defaults to PAGE_SIZE * 40000,
  and is adjusted again at init time if MODULES_VADDR is defined.

  For ppc64 kernels, MODULES_VADDR isn't defined, so we're stuck with
  the compile-time default at boot-time, which is 0x9c400000 when
  using 64K page size. This overflows the signed 32-bit bpf_jit_limit
  value:

  root@ubuntu:/tmp# cat /proc/sys/net/core/bpf_jit_limit
  -1673527296

  and can cause various unexpected failures throughout the network
  stack. In one case `strace dhclient eth0` reported:

  setsockopt(5, SOL_SOCKET, SO_ATTACH_FILTER, {len=11, filter=0x105dd27f8},
             16) = -1 ENOTSUPP (Unknown error 524)

  and similar failures can be seen with tools like tcpdump. This doesn't
  always reproduce however, and I'm not sure why. The more consistent
  failure I've seen is an Ubuntu 18.04 KVM guest booted on a POWER9
  host would time out on systemd/netplan configuring a virtio-net NIC
  with no noticeable errors in the logs.

Given this and also given that in near future some architectures like
arm64 will have a custom area for BPF JIT image allocations we should
get rid of the BPF_JIT_LIMIT_DEFAULT fallback / default entirely. For
4.21, we have an overridable bpf_jit_alloc_exec(), bpf_jit_free_exec()
so therefore add another overridable bpf_jit_alloc_exec_limit() helper
function which returns the possible size of the memory area for deriving
the default heuristic in bpf_jit_charge_init().

Like bpf_jit_alloc_exec() and bpf_jit_free_exec(), the new
bpf_jit_alloc_exec_limit() assumes that module_alloc() is the default
JIT memory provider, and therefore in case archs implement their custom
module_alloc() we use MODULES_{END,_VADDR} for limits and otherwise for
vmalloc_exec() cases like on ppc64 we use VMALLOC_{END,_START}.

Additionally, for archs supporting large page sizes, we should change
the sysctl to be handled as long to not run into sysctl restrictions
in future.

Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations")
Reported-by: Sandipan Das <sandipan@linux.ibm.com>
Reported-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h     |  2 +-
 kernel/bpf/core.c          | 21 +++++++++++++++------
 net/core/sysctl_net_core.c | 20 +++++++++++++++++---
 3 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 795ff0b869bb..a8b9d90a8042 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
-extern int bpf_jit_limit;
+extern long bpf_jit_limit;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b1a3545d0ec8..b2890c268cb3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -365,13 +365,11 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
 }
 
 #ifdef CONFIG_BPF_JIT
-# define BPF_JIT_LIMIT_DEFAULT	(PAGE_SIZE * 40000)
-
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
 int bpf_jit_harden   __read_mostly;
 int bpf_jit_kallsyms __read_mostly;
-int bpf_jit_limit    __read_mostly = BPF_JIT_LIMIT_DEFAULT;
+long bpf_jit_limit   __read_mostly;
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
@@ -580,16 +578,27 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
 static atomic_long_t bpf_jit_current;
 
+/* Can be overridden by an arch's JIT compiler if it has a custom,
+ * dedicated BPF backend memory area, or if neither of the two
+ * below apply.
+ */
+u64 __weak bpf_jit_alloc_exec_limit(void)
+{
 #if defined(MODULES_VADDR)
+	return MODULES_END - MODULES_VADDR;
+#else
+	return VMALLOC_END - VMALLOC_START;
+#endif
+}
+
 static int __init bpf_jit_charge_init(void)
 {
 	/* Only used as heuristic here to derive limit. */
-	bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2,
-					    PAGE_SIZE), INT_MAX);
+	bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+					    PAGE_SIZE), LONG_MAX);
 	return 0;
 }
 pure_initcall(bpf_jit_charge_init);
-#endif
 
 static int bpf_jit_charge_modmem(u32 pages)
 {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 37b4667128a3..d67ec17f2cc8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,8 @@ static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static long long_one __maybe_unused = 1;
+static long long_max __maybe_unused = LONG_MAX;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
 
 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
+
+static int
+proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
 #endif
 
 static struct ctl_table net_core_table[] = {
@@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = {
 	{
 		.procname	= "bpf_jit_limit",
 		.data		= &bpf_jit_limit,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(long),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &one,
+		.proc_handler	= proc_dolongvec_minmax_bpf_restricted,
+		.extra1		= &long_one,
+		.extra2		= &long_max,
 	},
 #endif
 	{
-- 
cgit 


From 07c17732bd687567802aaa5fa5c101c2776565d1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 11 Dec 2018 18:49:05 +0900
Subject: printk: Remove print_prefix() calls with NULL buffer.

We can save lines/size by removing print_prefix() with buf == NULL.
This patch makes no functional change.

Link: http://lkml.kernel.org/r/1544521745-11925-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
To: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk.c | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c7d217764db3..91db332ccf4d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1228,13 +1228,15 @@ static inline void boot_delay_msec(int level)
 static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
 module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
 
+static size_t print_syslog(unsigned int level, char *buf)
+{
+	return sprintf(buf, "<%u>", level);
+}
+
 static size_t print_time(u64 ts, char *buf)
 {
 	unsigned long rem_nsec = do_div(ts, 1000000000);
 
-	if (!buf)
-		return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
-
 	return sprintf(buf, "[%5lu.%06lu] ",
 		       (unsigned long)ts, rem_nsec / 1000);
 }
@@ -1243,24 +1245,11 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
 			   bool time, char *buf)
 {
 	size_t len = 0;
-	unsigned int prefix = (msg->facility << 3) | msg->level;
-
-	if (syslog) {
-		if (buf) {
-			len += sprintf(buf, "<%u>", prefix);
-		} else {
-			len += 3;
-			if (prefix > 999)
-				len += 3;
-			else if (prefix > 99)
-				len += 2;
-			else if (prefix > 9)
-				len++;
-		}
-	}
 
+	if (syslog)
+		len = print_syslog((msg->facility << 3) | msg->level, buf);
 	if (time)
-		len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+		len += print_time(msg->ts_nsec, buf + len);
 	return len;
 }
 
@@ -1270,6 +1259,8 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog,
 	const char *text = log_text(msg);
 	size_t text_size = msg->text_len;
 	size_t len = 0;
+	char prefix[PREFIX_MAX];
+	const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
 
 	do {
 		const char *next = memchr(text, '\n', text_size);
@@ -1284,19 +1275,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog,
 		}
 
 		if (buf) {
-			if (print_prefix(msg, syslog, time, NULL) +
-			    text_len + 1 >= size - len)
+			if (prefix_len + text_len + 1 >= size - len)
 				break;
 
-			len += print_prefix(msg, syslog, time, buf + len);
+			memcpy(buf + len, prefix, prefix_len);
+			len += prefix_len;
 			memcpy(buf + len, text, text_len);
 			len += text_len;
 			buf[len++] = '\n';
 		} else {
 			/* SYSLOG_ACTION_* buffer size only calculation */
-			len += print_prefix(msg, syslog, time, NULL);
-			len += text_len;
-			len++;
+			len += prefix_len + text_len + 1;
 		}
 
 		text = next;
-- 
cgit 


From 943a10f85265d4c14753d445003b6f5d8e3d959a Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Tue, 11 Dec 2018 11:20:48 -0500
Subject: PM / sleep: convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/main.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 35b50823d83b..98e76cad128b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -318,23 +318,12 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
 
 	return 0;
 }
-
-static int suspend_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, suspend_stats_show, NULL);
-}
-
-static const struct file_operations suspend_stats_operations = {
-	.open           = suspend_stats_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(suspend_stats);
 
 static int __init pm_debugfs_init(void)
 {
 	debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
-			NULL, NULL, &suspend_stats_operations);
+			NULL, NULL, &suspend_stats_fops);
 	return 0;
 }
 
-- 
cgit 


From 1b2b234b1318afb3775d4c6624fd5a96558f19df Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guroan@gmail.com>
Date: Mon, 10 Dec 2018 15:43:00 -0800
Subject: bpf: pass struct btf pointer to the map_check_btf() callback

If key_type or value_type are of non-trivial data types
(e.g. structure or typedef), it's not possible to check them without
the additional information, which can't be obtained without a pointer
to the btf structure.

So, let's pass btf pointer to the map_check_btf() callbacks.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   | 3 +++
 kernel/bpf/arraymap.c | 1 +
 kernel/bpf/lpm_trie.c | 1 +
 kernel/bpf/syscall.c  | 3 ++-
 4 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0c992b86eb2c..e734f163bd0b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -23,6 +23,7 @@ struct bpf_prog;
 struct bpf_map;
 struct sock;
 struct seq_file;
+struct btf;
 struct btf_type;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
@@ -52,6 +53,7 @@ struct bpf_map_ops {
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
 				  struct seq_file *m);
 	int (*map_check_btf)(const struct bpf_map *map,
+			     const struct btf *btf,
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
 };
@@ -126,6 +128,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 }
 
 int map_check_no_btf(const struct bpf_map *map,
+		     const struct btf *btf,
 		     const struct btf_type *key_type,
 		     const struct btf_type *value_type);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 24583da9ffd1..25632a75d630 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -382,6 +382,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
 }
 
 static int array_map_check_btf(const struct bpf_map *map,
+			       const struct btf *btf,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)
 {
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index bfd4882e1106..abf1002080df 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -728,6 +728,7 @@ free_stack:
 }
 
 static int trie_check_btf(const struct bpf_map *map,
+			  const struct btf *btf,
 			  const struct btf_type *key_type,
 			  const struct btf_type *value_type)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5745c7837621..70fb11106fc2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -456,6 +456,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 }
 
 int map_check_no_btf(const struct bpf_map *map,
+		     const struct btf *btf,
 		     const struct btf_type *key_type,
 		     const struct btf_type *value_type)
 {
@@ -478,7 +479,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
 		return -EINVAL;
 
 	if (map->ops->map_check_btf)
-		ret = map->ops->map_check_btf(map, key_type, value_type);
+		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
 	return ret;
 }
-- 
cgit 


From 9a1126b63190e2541dd5d643f4bfeb5a7f493729 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guroan@gmail.com>
Date: Mon, 10 Dec 2018 15:43:01 -0800
Subject: bpf: add bpffs pretty print for cgroup local storage maps

Implement bpffs pretty printing for cgroup local storage maps
(both shared and per-cpu).
Output example (captured for tools/testing/selftests/bpf/netcnt_prog.c):

Shared:
  $ cat /sys/fs/bpf/map_2
  # WARNING!! The output is for debug purpose only
  # WARNING!! The output format will change
  {4294968594,1}: {9999,1039896}

Per-cpu:
  $ cat /sys/fs/bpf/map_1
  # WARNING!! The output is for debug purpose only
  # WARNING!! The output format will change
  {4294968594,1}: {
  	cpu0: {0,0,0,0,0}
  	cpu1: {0,0,0,0,0}
  	cpu2: {1,104,0,0,0}
  	cpu3: {0,0,0,0,0}
  }

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h        |  1 +
 kernel/bpf/btf.c           | 22 +++++++++++
 kernel/bpf/local_storage.c | 93 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 115 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index b98405a56383..a4cf075b89eb 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -47,6 +47,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
 bool btf_name_offset_valid(const struct btf *btf, u32 offset);
+bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index bf34933cc413..1545ddfb6fa5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -514,6 +514,28 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
 	return true;
 }
 
+/*
+ * Check that given type is a regular int and has the expected size.
+ */
+bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size)
+{
+	u8 nr_bits, nr_bytes;
+	u32 int_data;
+
+	if (!btf_type_is_int(t))
+		return false;
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data);
+	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+	if (BITS_PER_BYTE_MASKED(nr_bits) ||
+	    BTF_INT_OFFSET(int_data) ||
+	    nr_bytes != expected_size)
+		return false;
+
+	return true;
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
 					      const char *fmt, ...)
 {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index b65017dead44..5eca03da0989 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,11 +1,13 @@
 //SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf-cgroup.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/bug.h>
 #include <linux/filter.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <uapi/linux/btf.h>
 
 DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
@@ -308,6 +310,94 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
 	return -EINVAL;
 }
 
+static int cgroup_storage_check_btf(const struct bpf_map *map,
+				    const struct btf *btf,
+				    const struct btf_type *key_type,
+				    const struct btf_type *value_type)
+{
+	const struct btf_type *t;
+	struct btf_member *m;
+	u32 id, size;
+
+	/* Key is expected to be of struct bpf_cgroup_storage_key type,
+	 * which is:
+	 * struct bpf_cgroup_storage_key {
+	 *	__u64	cgroup_inode_id;
+	 *	__u32	attach_type;
+	 * };
+	 */
+
+	/*
+	 * Key_type must be a structure with two fields.
+	 */
+	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+	    BTF_INFO_VLEN(key_type->info) != 2)
+		return -EINVAL;
+
+	/*
+	 * The first field must be a 64 bit integer at 0 offset.
+	 */
+	m = (struct btf_member *)(key_type + 1);
+	if (m->offset)
+		return -EINVAL;
+	id = m->type;
+	t = btf_type_id_size(btf, &id, NULL);
+	size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+	if (!t || !btf_type_is_reg_int(t, size))
+		return -EINVAL;
+
+	/*
+	 * The second field must be a 32 bit integer at 64 bit offset.
+	 */
+	m++;
+	if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) *
+	    BITS_PER_BYTE)
+		return -EINVAL;
+	id = m->type;
+	t = btf_type_id_size(btf, &id, NULL);
+	size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+	if (!t || !btf_type_is_reg_int(t, size))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+					 struct seq_file *m)
+{
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map_to_storage(map), key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return;
+	}
+
+	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		seq_puts(m, ": ");
+		btf_type_seq_show(map->btf, map->btf_value_type_id,
+				  &READ_ONCE(storage->buf)->data[0], m);
+		seq_puts(m, "\n");
+	} else {
+		seq_puts(m, ": {\n");
+		for_each_possible_cpu(cpu) {
+			seq_printf(m, "\tcpu%d: ", cpu);
+			btf_type_seq_show(map->btf, map->btf_value_type_id,
+					  per_cpu_ptr(storage->percpu_buf, cpu),
+					  m);
+			seq_puts(m, "\n");
+		}
+		seq_puts(m, "}\n");
+	}
+	rcu_read_unlock();
+}
+
 const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_alloc = cgroup_storage_map_alloc,
 	.map_free = cgroup_storage_map_free,
@@ -315,7 +405,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_lookup_elem = cgroup_storage_lookup_elem,
 	.map_update_elem = cgroup_storage_update_elem,
 	.map_delete_elem = cgroup_storage_delete_elem,
-	.map_check_btf = map_check_no_btf,
+	.map_check_btf = cgroup_storage_check_btf,
+	.map_seq_show_elem = cgroup_storage_seq_show_elem,
 };
 
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
-- 
cgit 


From 06459901d55ee2f690b8e1fe084fb03061d617cf Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Fri, 9 Nov 2018 18:21:32 +0100
Subject: irq/irq_sim: Store multiple interrupt offsets in a bitmap

Two threads can try to fire the irq_sim with different offsets and will
end up fighting for the irq_work asignment. Thomas Gleixner suggested a
solution based on a bitfield where we set a bit for every offset
associated with an interrupt that should be fired and then iterate over
all set bits in the interrupt handler.

This is a slightly modified solution using a bitmap so that we don't
impose a limit on the number of interrupts one can allocate with
irq_sim.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irq_sim.h |  2 +-
 kernel/irq/irq_sim.c    | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h
index 630a57e55db6..4500d453a63e 100644
--- a/include/linux/irq_sim.h
+++ b/include/linux/irq_sim.h
@@ -16,7 +16,7 @@
 
 struct irq_sim_work_ctx {
 	struct irq_work		work;
-	int			irq;
+	unsigned long		*pending;
 };
 
 struct irq_sim_irq_ctx {
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index dd20d0d528d4..98a20e1594ce 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -34,9 +34,20 @@ static struct irq_chip irq_sim_irqchip = {
 static void irq_sim_handle_irq(struct irq_work *work)
 {
 	struct irq_sim_work_ctx *work_ctx;
+	unsigned int offset = 0;
+	struct irq_sim *sim;
+	int irqnum;
 
 	work_ctx = container_of(work, struct irq_sim_work_ctx, work);
-	handle_simple_irq(irq_to_desc(work_ctx->irq));
+	sim = container_of(work_ctx, struct irq_sim, work_ctx);
+
+	while (!bitmap_empty(work_ctx->pending, sim->irq_count)) {
+		offset = find_next_bit(work_ctx->pending,
+				       sim->irq_count, offset);
+		clear_bit(offset, work_ctx->pending);
+		irqnum = irq_sim_irqnum(sim, offset);
+		handle_simple_irq(irq_to_desc(irqnum));
+	}
 }
 
 /**
@@ -63,6 +74,13 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
 		return sim->irq_base;
 	}
 
+	sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
+	if (!sim->work_ctx.pending) {
+		kfree(sim->irqs);
+		irq_free_descs(sim->irq_base, num_irqs);
+		return -ENOMEM;
+	}
+
 	for (i = 0; i < num_irqs; i++) {
 		sim->irqs[i].irqnum = sim->irq_base + i;
 		sim->irqs[i].enabled = false;
@@ -89,6 +107,7 @@ EXPORT_SYMBOL_GPL(irq_sim_init);
 void irq_sim_fini(struct irq_sim *sim)
 {
 	irq_work_sync(&sim->work_ctx.work);
+	bitmap_free(sim->work_ctx.pending);
 	irq_free_descs(sim->irq_base, sim->irq_count);
 	kfree(sim->irqs);
 }
@@ -143,7 +162,7 @@ EXPORT_SYMBOL_GPL(devm_irq_sim_init);
 void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
 {
 	if (sim->irqs[offset].enabled) {
-		sim->work_ctx.irq = irq_sim_irqnum(sim, offset);
+		set_bit(offset, sim->work_ctx.pending);
 		irq_work_queue(&sim->work_ctx.work);
 	}
 }
-- 
cgit 


From 9e794163a69c103633fefb10a3879408d4e4e2c8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 12 Dec 2018 10:18:21 -0800
Subject: bpf: Remove bpf_dump_raw_ok() check for func_info and line_info

The func_info and line_info have the bpf insn offset but
they do not contain kernel address.  They will still be useful
for the userspace tool to annotate the xlated insn.

This patch removes the bpf_dump_raw_ok() guard for the
func_info and line_info during bpf_prog_get_info_by_fd().

The guard stays for jited_line_info which contains the kernel
address.

Although this bpf_dump_raw_ok() guard behavior has started since
the earlier func_info patch series, I marked the Fixes tag to the
latest line_info patch series which contains both func_info and
line_info and this patch is fixing for both of them.

Fixes: c454a46b5efd ("bpf: Add bpf_line_info support")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 70fb11106fc2..b7c585838c72 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2272,33 +2272,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	ulen = info.nr_func_info;
 	info.nr_func_info = prog->aux->func_info_cnt;
 	if (info.nr_func_info && ulen) {
-		if (bpf_dump_raw_ok()) {
-			char __user *user_finfo;
+		char __user *user_finfo;
 
-			user_finfo = u64_to_user_ptr(info.func_info);
-			ulen = min_t(u32, info.nr_func_info, ulen);
-			if (copy_to_user(user_finfo, prog->aux->func_info,
-					 info.func_info_rec_size * ulen))
-				return -EFAULT;
-		} else {
-			info.func_info = 0;
-		}
+		user_finfo = u64_to_user_ptr(info.func_info);
+		ulen = min_t(u32, info.nr_func_info, ulen);
+		if (copy_to_user(user_finfo, prog->aux->func_info,
+				 info.func_info_rec_size * ulen))
+			return -EFAULT;
 	}
 
 	ulen = info.nr_line_info;
 	info.nr_line_info = prog->aux->nr_linfo;
 	if (info.nr_line_info && ulen) {
-		if (bpf_dump_raw_ok()) {
-			__u8 __user *user_linfo;
+		__u8 __user *user_linfo;
 
-			user_linfo = u64_to_user_ptr(info.line_info);
-			ulen = min_t(u32, info.nr_line_info, ulen);
-			if (copy_to_user(user_linfo, prog->aux->linfo,
-					 info.line_info_rec_size * ulen))
-				return -EFAULT;
-		} else {
-			info.line_info = 0;
-		}
+		user_linfo = u64_to_user_ptr(info.line_info);
+		ulen = min_t(u32, info.nr_line_info, ulen);
+		if (copy_to_user(user_linfo, prog->aux->linfo,
+				 info.line_info_rec_size * ulen))
+			return -EFAULT;
 	}
 
 	ulen = info.nr_jited_line_info;
-- 
cgit 


From c872bdb38febb4c31ece3599c52cf1f833b89f4e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 12 Dec 2018 09:37:46 -0800
Subject: bpf: include sub program tags in bpf_prog_info

Changes v2 -> v3:
1. remove check for bpf_dump_raw_ok().

Changes v1 -> v2:
1. Fix error path as Martin suggested.

This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a
reliable way for user space to get tags of all sub programs. Before this
patch, user space need to find sub program tags via kallsyms.

This feature will be used in BPF introspection, where user space queries
information about BPF programs via sys_bpf.

Signed-off-by: Song Liu <songliubraving@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa582cd5bfcf..e7d57e89f25f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2717,6 +2717,8 @@ struct bpf_prog_info {
 	__u32 nr_jited_line_info;
 	__u32 line_info_rec_size;
 	__u32 jited_line_info_rec_size;
+	__u32 nr_prog_tags;
+	__aligned_u64 prog_tags;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b7c585838c72..7f1410d6fbe9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2315,6 +2315,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_prog_tags;
+	info.nr_prog_tags = prog->aux->func_cnt ? : 1;
+	if (ulen) {
+		__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
+		u32 i;
+
+		user_prog_tags = u64_to_user_ptr(info.prog_tags);
+		ulen = min_t(u32, info.nr_prog_tags, ulen);
+		if (prog->aux->func_cnt) {
+			for (i = 0; i < ulen; i++) {
+				if (copy_to_user(user_prog_tags[i],
+						 prog->aux->func[i]->tag,
+						 BPF_TAG_SIZE))
+					return -EFAULT;
+			}
+		} else {
+			if (copy_to_user(user_prog_tags[0],
+					 prog->tag, BPF_TAG_SIZE))
+				return -EFAULT;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit 


From ba830885656414101b2f8ca88786524d4bb5e8c1 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko@arm.com>
Date: Fri, 7 Dec 2018 18:39:28 +0000
Subject: arm64: add prctl control for resetting ptrauth keys

Add an arm64-specific prctl to allow a thread to reinitialize its
pointer authentication keys to random values. This can be useful when
exec() is not used for starting new processes, to ensure that different
processes still have different keys.

Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/pointer_auth.h |  3 +++
 arch/arm64/include/asm/processor.h    |  4 +++
 arch/arm64/kernel/Makefile            |  1 +
 arch/arm64/kernel/pointer_auth.c      | 47 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h            |  8 ++++++
 kernel/sys.c                          |  8 ++++++
 6 files changed, 71 insertions(+)
 create mode 100644 arch/arm64/kernel/pointer_auth.c

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index 5ccf49b4dac3..80eb03afd677 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -63,6 +63,8 @@ static inline void ptrauth_keys_switch(struct ptrauth_keys *keys)
 		__ptrauth_key_install(APGA, keys->apga);
 }
 
+extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg);
+
 /*
  * The EL0 pointer bits used by a pointer authentication code.
  * This is dependent on TBI0 being enabled, or bits 63:56 would also apply.
@@ -86,6 +88,7 @@ do {									\
 	ptrauth_keys_switch(&(tsk)->thread_info.keys_user)
 
 #else /* CONFIG_ARM64_PTR_AUTH */
+#define ptrauth_prctl_reset_keys(tsk, arg)	(-EINVAL)
 #define ptrauth_strip_insn_pac(lr)	(lr)
 #define ptrauth_thread_init_user(tsk)
 #define ptrauth_thread_switch(tsk)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index f4b8e09aff56..142c708cb429 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -44,6 +44,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/lse.h>
 #include <asm/pgtable-hwdef.h>
+#include <asm/pointer_auth.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
@@ -289,6 +290,9 @@ extern void __init minsigstksz_setup(void);
 #define SVE_SET_VL(arg)	sve_set_current_vl(arg)
 #define SVE_GET_VL()	sve_get_current_vl()
 
+/* PR_PAC_RESET_KEYS prctl */
+#define PAC_RESET_KEYS(tsk, arg)	ptrauth_prctl_reset_keys(tsk, arg)
+
 /*
  * For CONFIG_GCC_PLUGIN_STACKLEAK
  *
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 583334ce1c2c..df08d735b21d 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -58,6 +58,7 @@ arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 arm64-obj-$(CONFIG_CRASH_CORE)		+= crash_core.o
 arm64-obj-$(CONFIG_ARM_SDE_INTERFACE)	+= sdei.o
 arm64-obj-$(CONFIG_ARM64_SSBD)		+= ssbd.o
+arm64-obj-$(CONFIG_ARM64_PTR_AUTH)	+= pointer_auth.o
 
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
new file mode 100644
index 000000000000..b9f6f5f3409a
--- /dev/null
+++ b/arch/arm64/kernel/pointer_auth.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/errno.h>
+#include <linux/prctl.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <asm/cpufeature.h>
+#include <asm/pointer_auth.h>
+
+int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
+{
+	struct ptrauth_keys *keys = &tsk->thread_info.keys_user;
+	unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY |
+				      PR_PAC_APDAKEY | PR_PAC_APDBKEY;
+	unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY;
+
+	if (!system_supports_address_auth() && !system_supports_generic_auth())
+		return -EINVAL;
+
+	if (!arg) {
+		ptrauth_keys_init(keys);
+		ptrauth_keys_switch(keys);
+		return 0;
+	}
+
+	if (arg & ~key_mask)
+		return -EINVAL;
+
+	if (((arg & addr_key_mask) && !system_supports_address_auth()) ||
+	    ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth()))
+		return -EINVAL;
+
+	if (arg & PR_PAC_APIAKEY)
+		get_random_bytes(&keys->apia, sizeof(keys->apia));
+	if (arg & PR_PAC_APIBKEY)
+		get_random_bytes(&keys->apib, sizeof(keys->apib));
+	if (arg & PR_PAC_APDAKEY)
+		get_random_bytes(&keys->apda, sizeof(keys->apda));
+	if (arg & PR_PAC_APDBKEY)
+		get_random_bytes(&keys->apdb, sizeof(keys->apdb));
+	if (arg & PR_PAC_APGAKEY)
+		get_random_bytes(&keys->apga, sizeof(keys->apga));
+
+	ptrauth_keys_switch(keys);
+
+	return 0;
+}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c0d7ea0bf5b6..0f535a501391 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -219,4 +219,12 @@ struct prctl_mm_map {
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 
+/* Reset arm64 pointer authentication keys */
+#define PR_PAC_RESET_KEYS		54
+# define PR_PAC_APIAKEY			(1UL << 0)
+# define PR_PAC_APIBKEY			(1UL << 1)
+# define PR_PAC_APDAKEY			(1UL << 2)
+# define PR_PAC_APDBKEY			(1UL << 3)
+# define PR_PAC_APGAKEY			(1UL << 4)
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 123bd73046ec..64b5a230f38d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -121,6 +121,9 @@
 #ifndef SVE_GET_VL
 # define SVE_GET_VL()		(-EINVAL)
 #endif
+#ifndef PAC_RESET_KEYS
+# define PAC_RESET_KEYS(a, b)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2476,6 +2479,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_PAC_RESET_KEYS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = PAC_RESET_KEYS(me, arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit 


From 7640ead939247e91e84b7ec6ec001f30193cc7df Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 12 Dec 2018 16:29:07 -0800
Subject: bpf: verifier: make sure callees don't prune with caller differences

Currently for liveness and state pruning the register parentage
chains don't include states of the callee.  This makes some sense
as the callee can't access those registers.  However, this means
that READs done after the callee returns will not propagate into
the states of the callee.  Callee will then perform pruning
disregarding differences in caller state.

Example:

   0: (85) call bpf_user_rnd_u32
   1: (b7) r8 = 0
   2: (55) if r0 != 0x0 goto pc+1
   3: (b7) r8 = 1
   4: (bf) r1 = r8
   5: (85) call pc+4
   6: (15) if r8 == 0x1 goto pc+1
   7: (05) *(u64 *)(r9 - 8) = r3
   8: (b7) r0 = 0
   9: (95) exit

   10: (15) if r1 == 0x0 goto pc+0
   11: (95) exit

Here we acquire unknown state with call to get_random() [1].  Then
we store this random state in r8 (either 0 or 1) [1 - 3], and make
a call on line 5.  Callee does nothing but a trivial conditional
jump (to create a pruning point).  Upon return caller checks the
state of r8 and either performs an unsafe read or not.

Verifier will first explore the path with r8 == 1, creating a pruning
point at [11].  The parentage chain for r8 will include only callers
states so once verifier reaches [6] it will mark liveness only on states
in the caller, and not [11].  Now when verifier walks the paths with
r8 == 0 it will reach [11] and since REG_LIVE_READ on r8 was not
propagated there it will prune the walk entirely (stop walking
the entire program, not just the callee).  Since [6] was never walked
with r8 == 0, [7] will be considered dead and replaced with "goto -1"
causing hang at runtime.

This patch weaves the callee's explored states onto the callers
parentage chain.  Rough parentage for r8 would have looked like this
before:

[0] [1] [2] [3] [4] [5]   [10]      [11]      [6]      [7]
     |           |      ,---|----.    |        |        |
  sl0:         sl0:    / sl0:     \ sl0:      sl0:     sl0:
  fr0: r8 <-- fr0: r8<+--fr0: r8   `fr0: r8  ,fr0: r8<-fr0: r8
                       \ fr1: r8 <- fr1: r8 /
                        \__________________/

after:

[0] [1] [2] [3] [4] [5]   [10]      [11]      [6]      [7]
     |           |          |         |        |        |
   sl0:         sl0:      sl0:       sl0:      sl0:     sl0:
   fr0: r8 <-- fr0: r8 <- fr0: r8 <- fr0: r8 <-fr0: r8<-fr0: r8
                          fr1: r8 <- fr1: r8

Now the mark from instruction 6 will travel through callees states.

Note that we don't have to connect r0 because its overwritten by
callees state on return and r1 - r5 because those are not alive
any more once a call is made.

v2:
 - don't connect the callees registers twice (Alexei: suggestion & code)
 - add more details to the comment (Ed & Alexei)
v1: don't unnecessarily link caller saved regs (Jiong)

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       | 13 ++++++++++---
 tools/testing/selftests/bpf/test_verifier.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fc760d00a38c..51ba84d4d34a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5102,9 +5102,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
-	/* connect new state to parentage chain */
-	for (i = 0; i < BPF_REG_FP; i++)
-		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
+	/* connect new state to parentage chain. Current frame needs all
+	 * registers connected. Only r6 - r9 of the callers are alive (pushed
+	 * to the stack implicitly by JITs) so in callers' frames connect just
+	 * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
+	 * the state of the call instruction (with WRITTEN set), and r0 comes
+	 * from callee with its full parentage chain, anyway.
+	 */
+	for (j = 0; j <= cur->curframe; j++)
+		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 37583267bdbc..f8eac4a544f4 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -13915,6 +13915,34 @@ static struct bpf_test tests[] = {
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
 	},
+	{
+		"calls: cross frame pruning",
+		.insns = {
+			/* r8 = !!random();
+			 * call pruner()
+			 * if (r8)
+			 *     do something bad;
+			 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_prandom_u32),
+			BPF_MOV64_IMM(BPF_REG_8, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_MOV64_IMM(BPF_REG_8, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+		.errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+		.result_unpriv = REJECT,
+		.errstr = "!read_ok",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit 


From 20b105feda8d42360bd690b8fdf6b2f00ba4a993 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 4 Dec 2018 08:04:31 -0800
Subject: dma-mapping: remove a pointless memset in dma_atomic_pool_init

We already zero the memory after allocating it from the pool that
this function fills, and having the memset here in this form means
we can't support CMA highmem allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Russell King - ARM Linux <linux@armlinux.org.uk>
---
 kernel/dma/remap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 8a44317cfc1a..18cc09fc27b9 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -121,7 +121,6 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
 	if (!page)
 		goto out;
 
-	memset(page_address(page), 0, atomic_pool_size);
 	arch_dma_prep_coherent(page, atomic_pool_size);
 
 	atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
-- 
cgit 


From 8d59b5f2a44611d7327a2a14b36090d692186f60 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 14:58:59 +0100
Subject: dma-mapping: simplify the dma_sync_single_range_for_{cpu,device}
 implementation

We can just call the regular calls after adding offset the the address instead
of reimplementing them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/dma-debug.h   | 27 ---------------------------
 include/linux/dma-mapping.h | 34 ++++++++++------------------------
 kernel/dma/debug.c          | 42 ------------------------------------------
 3 files changed, 10 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 46e6131a72b6..2ad5c363d7d5 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -70,17 +70,6 @@ extern void debug_dma_sync_single_for_device(struct device *dev,
 					     dma_addr_t dma_handle,
 					     size_t size, int direction);
 
-extern void debug_dma_sync_single_range_for_cpu(struct device *dev,
-						dma_addr_t dma_handle,
-						unsigned long offset,
-						size_t size,
-						int direction);
-
-extern void debug_dma_sync_single_range_for_device(struct device *dev,
-						   dma_addr_t dma_handle,
-						   unsigned long offset,
-						   size_t size, int direction);
-
 extern void debug_dma_sync_sg_for_cpu(struct device *dev,
 				      struct scatterlist *sg,
 				      int nelems, int direction);
@@ -167,22 +156,6 @@ static inline void debug_dma_sync_single_for_device(struct device *dev,
 {
 }
 
-static inline void debug_dma_sync_single_range_for_cpu(struct device *dev,
-						       dma_addr_t dma_handle,
-						       unsigned long offset,
-						       size_t size,
-						       int direction)
-{
-}
-
-static inline void debug_dma_sync_single_range_for_device(struct device *dev,
-							  dma_addr_t dma_handle,
-							  unsigned long offset,
-							  size_t size,
-							  int direction)
-{
-}
-
 static inline void debug_dma_sync_sg_for_cpu(struct device *dev,
 					     struct scatterlist *sg,
 					     int nelems, int direction)
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 7799c2b27849..8916499d2805 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -360,6 +360,13 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
 
+static inline void dma_sync_single_range_for_cpu(struct device *dev,
+		dma_addr_t addr, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
+{
+	return dma_sync_single_for_cpu(dev, addr + offset, size, dir);
+}
+
 static inline void dma_sync_single_for_device(struct device *dev,
 					      dma_addr_t addr, size_t size,
 					      enum dma_data_direction dir)
@@ -372,32 +379,11 @@ static inline void dma_sync_single_for_device(struct device *dev,
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
 
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
-						 dma_addr_t addr,
-						 unsigned long offset,
-						 size_t size,
-						 enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu)
-		ops->sync_single_for_cpu(dev, addr + offset, size, dir);
-	debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
-}
-
 static inline void dma_sync_single_range_for_device(struct device *dev,
-						    dma_addr_t addr,
-						    unsigned long offset,
-						    size_t size,
-						    enum dma_data_direction dir)
+		dma_addr_t addr, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
 {
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device)
-		ops->sync_single_for_device(dev, addr + offset, size, dir);
-	debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
+	return dma_sync_single_for_device(dev, addr + offset, size, dir);
 }
 
 static inline void
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 20ab0f6c1b70..164706da2a73 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1633,48 +1633,6 @@ void debug_dma_sync_single_for_device(struct device *dev,
 }
 EXPORT_SYMBOL(debug_dma_sync_single_for_device);
 
-void debug_dma_sync_single_range_for_cpu(struct device *dev,
-					 dma_addr_t dma_handle,
-					 unsigned long offset, size_t size,
-					 int direction)
-{
-	struct dma_debug_entry ref;
-
-	if (unlikely(dma_debug_disabled()))
-		return;
-
-	ref.type         = dma_debug_single;
-	ref.dev          = dev;
-	ref.dev_addr     = dma_handle;
-	ref.size         = offset + size;
-	ref.direction    = direction;
-	ref.sg_call_ents = 0;
-
-	check_sync(dev, &ref, true);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu);
-
-void debug_dma_sync_single_range_for_device(struct device *dev,
-					    dma_addr_t dma_handle,
-					    unsigned long offset,
-					    size_t size, int direction)
-{
-	struct dma_debug_entry ref;
-
-	if (unlikely(dma_debug_disabled()))
-		return;
-
-	ref.type         = dma_debug_single;
-	ref.dev          = dev;
-	ref.dev_addr     = dma_handle;
-	ref.size         = offset + size;
-	ref.direction    = direction;
-	ref.sg_call_ents = 0;
-
-	check_sync(dev, &ref, false);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_device);
-
 void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 			       int nelems, int direction)
 {
-- 
cgit 


From 05887cb610a54bf568de7f0bc07c4a64e45ac6f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 12:25:54 -0800
Subject: dma-mapping: move dma_get_required_mask to kernel/dma

dma_get_required_mask should really be with the rest of the DMA mapping
implementation instead of in drivers/base as a lone outlier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 drivers/base/platform.c | 31 -------------------------------
 kernel/dma/mapping.c    | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 41b91af95afb..eae841935a45 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1179,37 +1179,6 @@ int __init platform_bus_init(void)
 	return error;
 }
 
-#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
-static u64 dma_default_get_required_mask(struct device *dev)
-{
-	u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
-	u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
-	u64 mask;
-
-	if (!high_totalram) {
-		/* convert to mask just covering totalram */
-		low_totalram = (1 << (fls(low_totalram) - 1));
-		low_totalram += low_totalram - 1;
-		mask = low_totalram;
-	} else {
-		high_totalram = (1 << (fls(high_totalram) - 1));
-		high_totalram += high_totalram - 1;
-		mask = (((u64)high_totalram) << 32) + 0xffffffff;
-	}
-	return mask;
-}
-
-u64 dma_get_required_mask(struct device *dev)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (ops->get_required_mask)
-		return ops->get_required_mask(dev);
-	return dma_default_get_required_mask(dev);
-}
-EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#endif
-
 static __initdata LIST_HEAD(early_platform_driver_list);
 static __initdata LIST_HEAD(early_platform_device_list);
 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index dfbc3deb95cd..dfe29d18dba1 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2006  SUSE Linux Products GmbH
  * Copyright (c) 2006  Tejun Heo <teheo@suse.de>
  */
-
+#include <linux/memblock.h> /* for max_pfn */
 #include <linux/acpi.h>
 #include <linux/dma-noncoherent.h>
 #include <linux/export.h>
@@ -262,3 +262,35 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
 #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
 }
 EXPORT_SYMBOL(dma_common_mmap);
+
+#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
+static u64 dma_default_get_required_mask(struct device *dev)
+{
+	u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
+	u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
+	u64 mask;
+
+	if (!high_totalram) {
+		/* convert to mask just covering totalram */
+		low_totalram = (1 << (fls(low_totalram) - 1));
+		low_totalram += low_totalram - 1;
+		mask = low_totalram;
+	} else {
+		high_totalram = (1 << (fls(high_totalram) - 1));
+		high_totalram += high_totalram - 1;
+		mask = (((u64)high_totalram) << 32) + 0xffffffff;
+	}
+	return mask;
+}
+
+u64 dma_get_required_mask(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (ops->get_required_mask)
+		return ops->get_required_mask(dev);
+	return dma_default_get_required_mask(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
+
-- 
cgit 


From 7249c1a52df9967cd23550f3dc24fb6ca43cdc6a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 12:43:30 -0800
Subject: dma-mapping: move various slow path functions out of line

There is no need to have all setup and coherent allocation / freeing
routines inline.  Move them out of line to keep the implemeation
nicely encapsulated and save some kernel text size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 arch/powerpc/include/asm/dma-mapping.h |   1 -
 include/linux/dma-mapping.h            | 150 +++------------------------------
 kernel/dma/mapping.c                   | 140 +++++++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 140 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..5201f2b7838c 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -108,7 +108,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 }
 
 #define HAVE_ARCH_DMA_SET_MASK 1
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
 extern u64 __dma_get_required_mask(struct device *dev);
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 3b431cc58794..0bbce52606c2 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -440,107 +440,24 @@ bool dma_in_atomic_pool(void *start, size_t size);
 void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags);
 bool dma_free_from_pool(void *start, size_t size);
 
-/**
- * dma_mmap_attrs - map a coherent DMA allocation into user space
- * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
- * @vma: vm_area_struct describing requested user mapping
- * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
- * @handle: device-view address returned from dma_alloc_attrs
- * @size: size of memory originally requested in dma_alloc_attrs
- * @attrs: attributes of mapping properties requested in dma_alloc_attrs
- *
- * Map a coherent DMA buffer previously allocated by dma_alloc_attrs
- * into user space.  The coherent DMA buffer must not be freed by the
- * driver until the user space mapping has been released.
- */
-static inline int
-dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr,
-	       dma_addr_t dma_addr, size_t size, unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->mmap)
-		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
-	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
-}
-
+int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs);
 #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0)
 
 int
 dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 		dma_addr_t dma_addr, size_t size, unsigned long attrs);
 
-static inline int
-dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
-		      dma_addr_t dma_addr, size_t size,
-		      unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->get_sgtable)
-		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
-					attrs);
-	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
-			attrs);
-}
-
+int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs);
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0)
 
-#ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev)	(true)
-#endif
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t flag,
-				       unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	void *cpu_addr;
-
-	BUG_ON(!ops);
-	WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
-
-	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
-		return cpu_addr;
-
-	/* let the implementation decide on the zone to allocate from: */
-	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
-
-	if (!arch_dma_alloc_attrs(&dev))
-		return NULL;
-	if (!ops->alloc)
-		return NULL;
-
-	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-	return cpu_addr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				     void *cpu_addr, dma_addr_t dma_handle,
-				     unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!ops);
-
-	if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
-		return;
-	/*
-	 * On non-coherent platforms which implement DMA-coherent buffers via
-	 * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
-	 * this far in IRQ context is a) at risk of a BUG_ON() or trying to
-	 * sleep on some machines, and b) an indication that the driver is
-	 * probably misusing the coherent API anyway.
-	 */
-	WARN_ON(irqs_disabled());
-
-	if (!ops->free || !cpu_addr)
-		return;
-
-	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t flag, unsigned long attrs);
+void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_handle, unsigned long attrs);
 
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp)
@@ -565,35 +482,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return 0;
 }
 
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
-	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
-		dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (!ops)
-		return 0;
-	if (!ops->dma_supported)
-		return 1;
-	return ops->dma_supported(dev, mask);
-}
-
-#ifndef HAVE_ARCH_DMA_SET_MASK
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-
-	dma_check_mask(dev, mask);
-
-	*dev->dma_mask = mask;
-	return 0;
-}
-#endif
+int dma_supported(struct device *dev, u64 mask);
+int dma_set_mask(struct device *dev, u64 mask);
+int dma_set_coherent_mask(struct device *dev, u64 mask);
 
 static inline u64 dma_get_mask(struct device *dev)
 {
@@ -602,21 +493,6 @@ static inline u64 dma_get_mask(struct device *dev)
 	return DMA_BIT_MASK(32);
 }
 
-#ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
-int dma_set_coherent_mask(struct device *dev, u64 mask);
-#else
-static inline int dma_set_coherent_mask(struct device *dev, u64 mask)
-{
-	if (!dma_supported(dev, mask))
-		return -EIO;
-
-	dma_check_mask(dev, mask);
-
-	dev->coherent_dma_mask = mask;
-	return 0;
-}
-#endif
-
 /*
  * Set both the DMA mask and the coherent DMA mask to the same thing.
  * Note that we don't check the return value from dma_set_coherent_mask()
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index dfe29d18dba1..176ae3e08916 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -223,7 +223,20 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
 		sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
 	return ret;
 }
-EXPORT_SYMBOL(dma_common_get_sgtable);
+
+int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	BUG_ON(!ops);
+	if (ops->get_sgtable)
+		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+					attrs);
+	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+			attrs);
+}
+EXPORT_SYMBOL(dma_get_sgtable_attrs);
 
 /*
  * Create userspace mapping for the DMA-coherent memory.
@@ -261,7 +274,31 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
 	return -ENXIO;
 #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
 }
-EXPORT_SYMBOL(dma_common_mmap);
+
+/**
+ * dma_mmap_attrs - map a coherent DMA allocation into user space
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @vma: vm_area_struct describing requested user mapping
+ * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
+ * @dma_addr: device-view address returned from dma_alloc_attrs
+ * @size: size of memory originally requested in dma_alloc_attrs
+ * @attrs: attributes of mapping properties requested in dma_alloc_attrs
+ *
+ * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user
+ * space.  The coherent DMA buffer must not be freed by the driver until the
+ * user space mapping has been released.
+ */
+int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	BUG_ON(!ops);
+	if (ops->mmap)
+		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+EXPORT_SYMBOL(dma_mmap_attrs);
 
 #ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
 static u64 dma_default_get_required_mask(struct device *dev)
@@ -294,3 +331,102 @@ u64 dma_get_required_mask(struct device *dev)
 EXPORT_SYMBOL_GPL(dma_get_required_mask);
 #endif
 
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev)	(true)
+#endif
+
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t flag, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	void *cpu_addr;
+
+	BUG_ON(!ops);
+	WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
+
+	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+		return cpu_addr;
+
+	/* let the implementation decide on the zone to allocate from: */
+	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
+
+	if (!arch_dma_alloc_attrs(&dev))
+		return NULL;
+	if (!ops->alloc)
+		return NULL;
+
+	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+	return cpu_addr;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_handle, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!ops);
+
+	if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
+		return;
+	/*
+	 * On non-coherent platforms which implement DMA-coherent buffers via
+	 * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
+	 * this far in IRQ context is a) at risk of a BUG_ON() or trying to
+	 * sleep on some machines, and b) an indication that the driver is
+	 * probably misusing the coherent API anyway.
+	 */
+	WARN_ON(irqs_disabled());
+
+	if (!ops->free || !cpu_addr)
+		return;
+
+	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+	ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+		dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (!ops)
+		return 0;
+	if (!ops->dma_supported)
+		return 1;
+	return ops->dma_supported(dev, mask);
+}
+EXPORT_SYMBOL(dma_supported);
+
+#ifndef HAVE_ARCH_DMA_SET_MASK
+int dma_set_mask(struct device *dev, u64 mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+
+	dma_check_mask(dev, mask);
+	*dev->dma_mask = mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+#endif
+
+#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+	if (!dma_supported(dev, mask))
+		return -EIO;
+
+	dma_check_mask(dev, mask);
+	dev->coherent_dma_mask = mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_coherent_mask);
+#endif
-- 
cgit 


From 8ddbe5943c0b1259b5ddb6dc1729863433fc256c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 12:47:50 -0800
Subject: dma-mapping: move dma_cache_sync out of line

This isn't exactly a slow path routine, but it is not super critical
either, and moving it out of line will help to keep the include chain
clean for the following DMA indirection bypass work.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/dma-mapping.h | 12 ++----------
 kernel/dma/mapping.c        | 11 +++++++++++
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0bbce52606c2..0f0078490df4 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -411,16 +411,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
 #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
 
-static inline void
-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-		enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (ops->cache_sync)
-		ops->cache_sync(dev, vaddr, size, dir);
-}
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+		enum dma_data_direction dir);
 
 extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 176ae3e08916..0b18cfbdde95 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -430,3 +430,14 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 #endif
+
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+		enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->cache_sync)
+		ops->cache_sync(dev, vaddr, size, dir);
+}
+EXPORT_SYMBOL(dma_cache_sync);
-- 
cgit 


From 3731c3d4774e38b9d91c01943e1e6a243c1776be Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 12:50:26 -0800
Subject: dma-mapping: always build the direct mapping code

All architectures except for sparc64 use the dma-direct code in some
form, and even for sparc64 we had the discussion of a direct mapping
mode a while ago.  In preparation for directly calling the direct
mapping code don't bother having it optionally but always build the
code in.  This is a minor hardship for some powerpc and arm configs
that don't pull it in yet (although they should in a relase ot two),
and sparc64 which currently doesn't need it at all, but it will
reduce the ifdef mess we'd otherwise need significantly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 arch/alpha/Kconfig      | 1 -
 arch/arc/Kconfig        | 1 -
 arch/arm/Kconfig        | 1 -
 arch/arm64/Kconfig      | 1 -
 arch/c6x/Kconfig        | 1 -
 arch/csky/Kconfig       | 1 -
 arch/h8300/Kconfig      | 1 -
 arch/hexagon/Kconfig    | 1 -
 arch/m68k/Kconfig       | 1 -
 arch/microblaze/Kconfig | 1 -
 arch/mips/Kconfig       | 1 -
 arch/nds32/Kconfig      | 1 -
 arch/nios2/Kconfig      | 1 -
 arch/openrisc/Kconfig   | 1 -
 arch/parisc/Kconfig     | 1 -
 arch/riscv/Kconfig      | 1 -
 arch/s390/Kconfig       | 1 -
 arch/sh/Kconfig         | 1 -
 arch/sparc/Kconfig      | 1 -
 arch/unicore32/Kconfig  | 1 -
 arch/x86/Kconfig        | 1 -
 arch/xtensa/Kconfig     | 1 -
 kernel/dma/Kconfig      | 7 -------
 kernel/dma/Makefile     | 3 +--
 24 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index a7e748a46c18..5da6ff54b3e7 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -203,7 +203,6 @@ config ALPHA_EIGER
 config ALPHA_JENSEN
 	bool "Jensen"
 	depends on BROKEN
-	select DMA_DIRECT_OPS
 	help
 	  DEC PC 150 AXP (aka Jensen): This is a very old Digital system - one
 	  of the first-generation Alpha systems. A number of these systems
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index fd48d698da29..7deaabeb531a 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -17,7 +17,6 @@ config ARC
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_FIND_FIRST_BIT
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a858ee791ef0..586fc30b23bd 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,7 +30,6 @@ config ARM
 	select CLONE_BACKWARDS
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select DMA_DIRECT_OPS if !MMU
 	select DMA_REMAP if MMU
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 06cf0ef24367..2092080240b0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -80,7 +80,6 @@ config ARM64
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select CRC32
 	select DCACHE_WORD_ACCESS
-	select DMA_DIRECT_OPS
 	select DMA_DIRECT_REMAP
 	select EDAC_SUPPORT
 	select FRAME_POINTER
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 84420109113d..456e154674d1 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -9,7 +9,6 @@ config C6X
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select CLKDEV_LOOKUP
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64
 	select GENERIC_IRQ_SHOW
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index ea74f3a9eeaf..37bed8aadf95 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -7,7 +7,6 @@ config CSKY
 	select COMMON_CLK
 	select CLKSRC_MMIO
 	select CLKSRC_OF
-	select DMA_DIRECT_OPS
 	select DMA_DIRECT_REMAP
 	select IRQ_DOMAIN
 	select HANDLE_DOMAIN_IRQ
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index d19c6b16cd5d..6472a0685470 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -22,7 +22,6 @@ config H8300
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_HASH
 	select CPU_NO_EFFICIENT_FFS
-	select DMA_DIRECT_OPS
 
 config CPU_BIG_ENDIAN
 	def_bool y
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 2b688af379e6..d71036c598de 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,7 +31,6 @@ config HEXAGON
 	select GENERIC_CLOCKEVENTS_BROADCAST
 	select MODULES_USE_ELF_RELA
 	select GENERIC_CPU_DEVICES
-	select DMA_DIRECT_OPS
 	---help---
 	  Qualcomm Hexagon is a processor architecture designed for high
 	  performance and low power across a wide variety of applications.
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 1bc9f1ba759a..8a5868e9a3a0 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -26,7 +26,6 @@ config M68K
 	select MODULES_USE_ELF_RELA
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
-	select DMA_DIRECT_OPS if HAS_DMA
 	select ARCH_DISCARD_MEMBLOCK
 
 config CPU_BIG_ENDIAN
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index effed2efd306..eda9e2315ef5 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -12,7 +12,6 @@ config MICROBLAZE
 	select TIMER_OF
 	select CLONE_BACKWARDS3
 	select COMMON_CLK
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CPU_DEVICES
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8272ea4c7264..2993aa9842c0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -16,7 +16,6 @@ config MIPS
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select CPU_PM if CPU_IDLE
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64 if !64BIT
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CMOS_UPDATE
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index 7a04adacb2f0..1af6bbae7220 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -11,7 +11,6 @@ config NDS32
 	select CLKSRC_MMIO
 	select CLONE_BACKWARDS
 	select COMMON_CLK
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64
 	select GENERIC_CPU_DEVICES
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 7e95506e957a..f6c4b0f49997 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -4,7 +4,6 @@ config NIOS2
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_NO_SWAP
-	select DMA_DIRECT_OPS
 	select TIMER_OF
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 285f7d05c8ed..d0feebad5a8f 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -7,7 +7,6 @@
 config OPENRISC
 	def_bool y
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select DMA_DIRECT_OPS
 	select OF
 	select OF_EARLY_FLATTREE
 	select IRQ_DOMAIN
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 428ee50fc3db..6e1b71da0e71 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -185,7 +185,6 @@ config PA11
 	depends on PA7000 || PA7100LC || PA7200 || PA7300LC
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select DMA_DIRECT_OPS
 	select DMA_NONCOHERENT_CACHE_SYNC
 
 config PREFETCH
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 55da93f4e818..51d89c4b1dca 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -19,7 +19,6 @@ config RISCV
 	select ARCH_WANT_FRAME_POINTERS
 	select CLONE_BACKWARDS
 	select COMMON_CLK
-	select DMA_DIRECT_OPS
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CPU_DEVICES
 	select GENERIC_IRQ_SHOW
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5624e8607054..21d271d04ca6 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -139,7 +139,6 @@ config S390
 	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
-	select DMA_DIRECT_OPS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index f82a4da7adf3..10fd4e9c454b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -7,7 +7,6 @@ config SUPERH
 	select ARCH_NO_COHERENT_DMA_MMAP if !MMU
 	select HAVE_PATA_PLATFORM
 	select CLKDEV_LOOKUP
-	select DMA_DIRECT_OPS
 	select HAVE_IDE if HAS_IOPORT_MAP
 	select HAVE_MEMBLOCK_NODE_MAP
 	select ARCH_DISCARD_MEMBLOCK
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8853b6ceae17..f5bb9ded1d18 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -48,7 +48,6 @@ config SPARC
 config SPARC32
 	def_bool !64BIT
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
-	select DMA_DIRECT_OPS
 	select GENERIC_ATOMIC64
 	select CLZ_TAB
 	select HAVE_UID16
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index a4c05159dca5..2681027d7bff 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -4,7 +4,6 @@ config UNICORE32
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
-	select DMA_DIRECT_OPS
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index adc845b66f01..c14d4a35be13 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,7 +89,6 @@ config X86
 	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
 	select CLOCKSOURCE_WATCHDOG
 	select DCACHE_WORD_ACCESS
-	select DMA_DIRECT_OPS
 	select EDAC_ATOMIC_SCRUB
 	select EDAC_SUPPORT
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 75488b606edc..36338e7564a3 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -9,7 +9,6 @@ config XTENSA
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
-	select DMA_DIRECT_OPS
 	select DMA_REMAP if MMU
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 41c3b1df70eb..ca88b867e7fe 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -35,13 +35,8 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
 config ARCH_HAS_DMA_MMAP_PGPROT
 	bool
 
-config DMA_DIRECT_OPS
-	bool
-	depends on HAS_DMA
-
 config DMA_NONCOHERENT_CACHE_SYNC
 	bool
-	depends on DMA_DIRECT_OPS
 
 config DMA_VIRT_OPS
 	bool
@@ -49,7 +44,6 @@ config DMA_VIRT_OPS
 
 config SWIOTLB
 	bool
-	select DMA_DIRECT_OPS
 	select NEED_DMA_MAP_STATE
 
 config DMA_REMAP
@@ -58,5 +52,4 @@ config DMA_REMAP
 
 config DMA_DIRECT_REMAP
 	bool
-	depends on DMA_DIRECT_OPS
 	select DMA_REMAP
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index f4feeceb8020..a626f643cd63 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_HAS_DMA)			+= mapping.o
+obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o
 obj-$(CONFIG_DMA_CMA)			+= contiguous.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_DIRECT_OPS)		+= direct.o
 obj-$(CONFIG_DMA_VIRT_OPS)		+= virt.o
 obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
-- 
cgit 


From 90ac706e98fcb24fb0b0a259558987f33cc2f0f6 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 6 Dec 2018 13:14:44 -0800
Subject: dma-mapping: factor out dummy DMA ops

The dummy DMA ops are currently used by arm64 for any device which has
an invalid ACPI description and is thus barred from using DMA due to not
knowing whether is is cache-coherent or not. Factor these out into
general dma-mapping code so that they can be referenced from other
common code paths. In the process, we can prune all the optional
callbacks which just do the same thing as the default behaviour, and
fill in .map_resource for completeness.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
[hch: moved to a separate source file]
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/arm64/include/asm/dma-mapping.h |  4 +-
 arch/arm64/mm/dma-mapping.c          | 86 ------------------------------------
 include/linux/dma-mapping.h          |  1 +
 kernel/dma/Makefile                  |  2 +-
 kernel/dma/dummy.c                   | 39 ++++++++++++++++
 5 files changed, 42 insertions(+), 90 deletions(-)
 create mode 100644 kernel/dma/dummy.c

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index c41f3fb1446c..273e778f7de2 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -24,15 +24,13 @@
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
-extern const struct dma_map_ops dummy_dma_ops;
-
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 	/*
 	 * We expect no ISA devices, and all other DMA masters are expected to
 	 * have someone call arch_setup_dma_ops at device creation time.
 	 */
-	return &dummy_dma_ops;
+	return &dma_dummy_ops;
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 4c0f498069e8..6ff6ec8806c1 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -89,92 +89,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_IOMMU_DMA */
 
-/********************************************
- * The following APIs are for dummy DMA ops *
- ********************************************/
-
-static void *__dummy_alloc(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flags,
-			   unsigned long attrs)
-{
-	return NULL;
-}
-
-static void __dummy_free(struct device *dev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle,
-			 unsigned long attrs)
-{
-}
-
-static int __dummy_mmap(struct device *dev,
-			struct vm_area_struct *vma,
-			void *cpu_addr, dma_addr_t dma_addr, size_t size,
-			unsigned long attrs)
-{
-	return -ENXIO;
-}
-
-static dma_addr_t __dummy_map_page(struct device *dev, struct page *page,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	return DMA_MAPPING_ERROR;
-}
-
-static void __dummy_unmap_page(struct device *dev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-}
-
-static int __dummy_map_sg(struct device *dev, struct scatterlist *sgl,
-			  int nelems, enum dma_data_direction dir,
-			  unsigned long attrs)
-{
-	return 0;
-}
-
-static void __dummy_unmap_sg(struct device *dev,
-			     struct scatterlist *sgl, int nelems,
-			     enum dma_data_direction dir,
-			     unsigned long attrs)
-{
-}
-
-static void __dummy_sync_single(struct device *dev,
-				dma_addr_t dev_addr, size_t size,
-				enum dma_data_direction dir)
-{
-}
-
-static void __dummy_sync_sg(struct device *dev,
-			    struct scatterlist *sgl, int nelems,
-			    enum dma_data_direction dir)
-{
-}
-
-static int __dummy_dma_supported(struct device *hwdev, u64 mask)
-{
-	return 0;
-}
-
-const struct dma_map_ops dummy_dma_ops = {
-	.alloc                  = __dummy_alloc,
-	.free                   = __dummy_free,
-	.mmap                   = __dummy_mmap,
-	.map_page               = __dummy_map_page,
-	.unmap_page             = __dummy_unmap_page,
-	.map_sg                 = __dummy_map_sg,
-	.unmap_sg               = __dummy_unmap_sg,
-	.sync_single_for_cpu    = __dummy_sync_single,
-	.sync_single_for_device = __dummy_sync_single,
-	.sync_sg_for_cpu        = __dummy_sync_sg,
-	.sync_sg_for_device     = __dummy_sync_sg,
-	.dma_supported          = __dummy_dma_supported,
-};
-EXPORT_SYMBOL(dummy_dma_ops);
-
 static int __init arm64_dma_init(void)
 {
 	WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0f0078490df4..269ee27fc3d9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -136,6 +136,7 @@ struct dma_map_ops {
 
 extern const struct dma_map_ops dma_direct_ops;
 extern const struct dma_map_ops dma_virt_ops;
+extern const struct dma_map_ops dma_dummy_ops;
 
 #define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
 
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index a626f643cd63..72ff6e46aa86 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o
+obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o dummy.o
 obj-$(CONFIG_DMA_CMA)			+= contiguous.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
 obj-$(CONFIG_DMA_VIRT_OPS)		+= virt.o
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
new file mode 100644
index 000000000000..05607642c888
--- /dev/null
+++ b/kernel/dma/dummy.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dummy DMA ops that always fail.
+ */
+#include <linux/dma-mapping.h>
+
+static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	return -ENXIO;
+}
+
+static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	return DMA_MAPPING_ERROR;
+}
+
+static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+		int nelems, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	return 0;
+}
+
+static int dma_dummy_supported(struct device *hwdev, u64 mask)
+{
+	return 0;
+}
+
+const struct dma_map_ops dma_dummy_ops = {
+	.mmap                   = dma_dummy_mmap,
+	.map_page               = dma_dummy_map_page,
+	.map_sg                 = dma_dummy_map_sg,
+	.dma_supported          = dma_dummy_supported,
+};
+EXPORT_SYMBOL(dma_dummy_ops);
-- 
cgit 


From b907e20508d02462a50c2841da0a5e3883fdab39 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 11:42:52 +0100
Subject: swiotlb: remove SWIOTLB_MAP_ERROR

We can use DMA_MAPPING_ERROR instead, which already maps to the same
value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 drivers/xen/swiotlb-xen.c | 4 ++--
 include/linux/swiotlb.h   | 3 ---
 kernel/dma/swiotlb.c      | 4 ++--
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 6dc969d5ea2f..833e80b46eb2 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -403,7 +403,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 
 	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
 				     attrs);
-	if (map == SWIOTLB_MAP_ERROR)
+	if (map == DMA_MAPPING_ERROR)
 		return DMA_MAPPING_ERROR;
 
 	dev_addr = xen_phys_to_bus(map);
@@ -572,7 +572,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 								 sg_phys(sg),
 								 sg->length,
 								 dir, attrs);
-			if (map == SWIOTLB_MAP_ERROR) {
+			if (map == DMA_MAPPING_ERROR) {
 				dev_warn(hwdev, "swiotlb buffer is full\n");
 				/* Don't panic here, we expect map_sg users
 				   to do proper error handling. */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a387b59640a4..14aec0b70dd9 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -46,9 +46,6 @@ enum dma_sync_target {
 	SYNC_FOR_DEVICE = 1,
 };
 
-/* define the last possible byte of physical address space as a mapping error */
-#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0)
-
 extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 					  dma_addr_t tbl_dma_addr,
 					  phys_addr_t phys, size_t size,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ff1ce81bb623..19ba8e473d71 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -526,7 +526,7 @@ not_found:
 	spin_unlock_irqrestore(&io_tlb_lock, flags);
 	if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
 		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
-	return SWIOTLB_MAP_ERROR;
+	return DMA_MAPPING_ERROR;
 found:
 	spin_unlock_irqrestore(&io_tlb_lock, flags);
 
@@ -637,7 +637,7 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys,
 	/* Oh well, have to allocate and map a bounce buffer. */
 	*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
 			*phys, size, dir, attrs);
-	if (*phys == SWIOTLB_MAP_ERROR)
+	if (*phys == DMA_MAPPING_ERROR)
 		return DMA_MAPPING_ERROR;
 
 	/* Ensure that the address returned is DMA'ble */
-- 
cgit 


From 68c608345cc569bcfa1c1b2add4c00c343ecf933 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 07:06:04 -0800
Subject: swiotlb: remove dma_mark_clean

Instead of providing a special dma_mark_clean hook just for ia64, switch
ia64 to use the normal arch_sync_dma_for_cpu hooks instead.

This means that we now also set the PG_arch_1 bit for pages in the
swiotlb buffer, which isn't stricly needed as we will never execute code
out of the swiotlb buffer, but otherwise harmless.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/Kconfig              |  3 ++-
 arch/ia64/kernel/dma-mapping.c | 20 +++++++++++++++++++-
 arch/ia64/mm/init.c            | 19 ++++++++-----------
 drivers/xen/swiotlb-xen.c      | 20 +-------------------
 include/linux/dma-direct.h     |  8 --------
 kernel/dma/swiotlb.c           | 18 +-----------------
 6 files changed, 31 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index d6f203658994..c587e3316c38 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -28,7 +28,8 @@ config IA64
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_VIRT_CPU_ACCOUNTING
-	select ARCH_HAS_DMA_MARK_CLEAN
+	select ARCH_HAS_DMA_COHERENT_TO_PFN
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select VIRT_TO_BUS
 	select ARCH_DISCARD_MEMBLOCK
 	select GENERIC_IRQ_PROBE
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 7a471d8d67d4..36dd6aa6d759 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
 #include <linux/swiotlb.h>
 #include <linux/export.h>
 
@@ -16,6 +16,24 @@ const struct dma_map_ops *dma_get_ops(struct device *dev)
 EXPORT_SYMBOL(dma_get_ops);
 
 #ifdef CONFIG_SWIOTLB
+void *arch_dma_alloc(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+	return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+}
+
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_addr, unsigned long attrs)
+{
+	dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+}
+
+long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
+		dma_addr_t dma_addr)
+{
+	return page_to_pfn(virt_to_page(cpu_addr));
+}
+
 void __init swiotlb_dma_init(void)
 {
 	dma_ops = &swiotlb_dma_ops;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d5e12ff1d73c..0cf43bb13d6e 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 
+#include <linux/dma-noncoherent.h>
 #include <linux/efi.h>
 #include <linux/elf.h>
 #include <linux/memblock.h>
@@ -71,18 +72,14 @@ __ia64_sync_icache_dcache (pte_t pte)
  * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
  * flush them when they get mapped into an executable vm-area.
  */
-void
-dma_mark_clean(void *addr, size_t size)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
 {
-	unsigned long pg_addr, end;
-
-	pg_addr = PAGE_ALIGN((unsigned long) addr);
-	end = (unsigned long) addr + size;
-	while (pg_addr + PAGE_SIZE <= end) {
-		struct page *page = virt_to_page(pg_addr);
-		set_bit(PG_arch_1, &page->flags);
-		pg_addr += PAGE_SIZE;
-	}
+	unsigned long pfn = PHYS_PFN(paddr);
+
+	do {
+		set_bit(PG_arch_1, &pfn_to_page(pfn)->flags);
+	} while (++pfn <= PHYS_PFN(paddr + size - 1));
 }
 
 inline void
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 833e80b46eb2..989cf872b98c 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -441,21 +441,8 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 	xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);
 
 	/* NOTE: We use dev_addr here, not paddr! */
-	if (is_xen_swiotlb_buffer(dev_addr)) {
+	if (is_xen_swiotlb_buffer(dev_addr))
 		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
-		return;
-	}
-
-	if (dir != DMA_FROM_DEVICE)
-		return;
-
-	/*
-	 * phys_to_virt doesn't work with hihgmem page but we could
-	 * call dma_mark_clean() with hihgmem page here. However, we
-	 * are fine since dma_mark_clean() is null on POWERPC. We can
-	 * make dma_mark_clean() take a physical address if necessary.
-	 */
-	dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
@@ -493,11 +480,6 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 
 	if (target == SYNC_FOR_DEVICE)
 		xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir);
-
-	if (dir != DMA_FROM_DEVICE)
-		return;
-
-	dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 void
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 6e5a47ae7d64..1aa73f4907ae 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -48,14 +48,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 	return __sme_clr(__dma_to_phys(dev, daddr));
 }
 
-#ifdef CONFIG_ARCH_HAS_DMA_MARK_CLEAN
-void dma_mark_clean(void *addr, size_t size);
-#else
-static inline void dma_mark_clean(void *addr, size_t size)
-{
-}
-#endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */
-
 u64 dma_direct_get_required_mask(struct device *dev);
 void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 19ba8e473d71..2e126bac5d7d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -706,21 +706,8 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
 		arch_sync_dma_for_cpu(hwdev, paddr, size, dir);
 
-	if (is_swiotlb_buffer(paddr)) {
+	if (is_swiotlb_buffer(paddr))
 		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
-		return;
-	}
-
-	if (dir != DMA_FROM_DEVICE)
-		return;
-
-	/*
-	 * phys_to_virt doesn't work with hihgmem page but we could
-	 * call dma_mark_clean() with hihgmem page here. However, we
-	 * are fine since dma_mark_clean() is null on POWERPC. We can
-	 * make dma_mark_clean() take a physical address if necessary.
-	 */
-	dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 /*
@@ -750,9 +737,6 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 
 	if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE)
 		arch_sync_dma_for_device(hwdev, paddr, size, dir);
-
-	if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE)
-		dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 void
-- 
cgit 


From 58dfd4ac022037c6a562e92fc6d2a778819b2162 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 07:43:05 +0100
Subject: dma-direct: improve addressability error reporting

Only report report a DMA addressability report once to avoid spewing the
kernel log with repeated message.  Also provide a stack trace to make it
easy to find the actual caller that caused the problem.

Last but not least move the actual check into the fast path and only
leave the error reporting in a helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 kernel/dma/direct.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 308f88a750c8..edb24f94ea1e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -30,27 +30,16 @@ static inline bool force_dma_unencrypted(void)
 	return sev_active();
 }
 
-static bool
-check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
-		const char *caller)
+static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 {
-	if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
-		if (!dev->dma_mask) {
-			dev_err(dev,
-				"%s: call on device without dma_mask\n",
-				caller);
-			return false;
-		}
-
-		if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
-			dev_err(dev,
-				"%s: overflow %pad+%zu of device mask %llx bus mask %llx\n",
-				caller, &dma_addr, size,
-				*dev->dma_mask, dev->bus_dma_mask);
-		}
-		return false;
+	if (!dev->dma_mask) {
+		dev_err_once(dev, "DMA map on device without dma_mask\n");
+	} else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+		dev_err_once(dev,
+			"overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
+			&dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
 	}
-	return true;
+	WARN_ON_ONCE(1);
 }
 
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
@@ -288,8 +277,10 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 	phys_addr_t phys = page_to_phys(page) + offset;
 	dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-	if (!check_addr(dev, dma_addr, size, __func__))
+	if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+		report_addr(dev, dma_addr, size);
 		return DMA_MAPPING_ERROR;
+	}
 
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
@@ -306,8 +297,11 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		BUG_ON(!sg_page(sg));
 
 		sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
-		if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
+		if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg),
+				sg->length))) {
+			report_addr(dev, sg_dma_address(sg), sg->length);
 			return 0;
+		}
 		sg_dma_len(sg) = sg->length;
 	}
 
-- 
cgit 


From 17ac524719f3fc88c1a90528f4789e4b4f618512 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 11:14:09 +0100
Subject: dma-direct: use dma_direct_map_page to implement dma_direct_map_sg

No need to duplicate the mapping logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 kernel/dma/direct.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index edb24f94ea1e..d45306473c90 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -217,6 +217,7 @@ static void dma_direct_sync_single_for_device(struct device *dev,
 	arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
 }
 
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
 static void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
@@ -229,6 +230,7 @@ static void dma_direct_sync_sg_for_device(struct device *dev,
 	for_each_sg(sgl, sg, nents, i)
 		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
 }
+#endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
@@ -294,19 +296,13 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct scatterlist *sg;
 
 	for_each_sg(sgl, sg, nents, i) {
-		BUG_ON(!sg_page(sg));
-
-		sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
-		if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg),
-				sg->length))) {
-			report_addr(dev, sg_dma_address(sg), sg->length);
+		sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+				sg->offset, sg->length, dir, attrs);
+		if (sg->dma_address == DMA_MAPPING_ERROR)
 			return 0;
-		}
 		sg_dma_len(sg) = sg->length;
 	}
 
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		dma_direct_sync_sg_for_device(dev, sgl, nents, dir);
 	return nents;
 }
 
-- 
cgit 


From 55897af63091ebc2c3f239c6a6666f748113ac50 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 11:43:54 +0100
Subject: dma-direct: merge swiotlb_dma_ops into the dma_direct code

While the dma-direct code is (relatively) clean and simple we actually
have to use the swiotlb ops for the mapping on many architectures due
to devices with addressing limits.  Instead of keeping two
implementations around this commit allows the dma-direct
implementation to call the swiotlb bounce buffering functions and
thus share the guts of the mapping implementation.  This also
simplified the dma-mapping setup on a few architectures where we
don't have to differenciate which implementation to use.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 arch/arm64/mm/dma-mapping.c          |   2 +-
 arch/ia64/hp/common/hwsw_iommu.c     |   2 +-
 arch/ia64/hp/common/sba_iommu.c      |   6 +-
 arch/ia64/kernel/dma-mapping.c       |   2 +-
 arch/mips/include/asm/dma-mapping.h  |   2 -
 arch/powerpc/kernel/dma-swiotlb.c    |  16 +--
 arch/riscv/include/asm/dma-mapping.h |  15 ---
 arch/x86/kernel/pci-swiotlb.c        |   4 +-
 arch/x86/mm/mem_encrypt.c            |   7 --
 arch/x86/pci/sta2x11-fixup.c         |   1 -
 include/linux/dma-direct.h           |  12 ++
 include/linux/swiotlb.h              |  74 +++++------
 kernel/dma/direct.c                  | 113 ++++++++++++-----
 kernel/dma/swiotlb.c                 | 232 ++---------------------------------
 14 files changed, 150 insertions(+), 338 deletions(-)
 delete mode 100644 arch/riscv/include/asm/dma-mapping.h

(limited to 'kernel')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6ff6ec8806c1..ab1e417204d0 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -463,7 +463,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
 	if (!dev->dma_ops)
-		dev->dma_ops = &swiotlb_dma_ops;
+		dev->dma_ops = &dma_direct_ops;
 
 	dev->dma_coherent = coherent;
 	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 58969039bed2..f40ca499b246 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev)
 const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
 {
 	if (use_swiotlb(dev))
-		return &swiotlb_dma_ops;
+		return &dma_direct_ops;
 	return &sba_dma_ops;
 }
 EXPORT_SYMBOL(hwsw_dma_get_ops);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 0d21c0b5b23d..5ee74820a0f6 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2065,8 +2065,6 @@ static int __init acpi_sba_ioc_init_acpi(void)
 /* This has to run before acpi_scan_init(). */
 arch_initcall(acpi_sba_ioc_init_acpi);
 
-extern const struct dma_map_ops swiotlb_dma_ops;
-
 static int __init
 sba_init(void)
 {
@@ -2080,7 +2078,7 @@ sba_init(void)
 	 * a successful kdump kernel boot is to use the swiotlb.
 	 */
 	if (is_kdump_kernel()) {
-		dma_ops = &swiotlb_dma_ops;
+		dma_ops = &dma_direct_ops;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to initialize software I/O TLB:"
 				  " Try machvec=dig boot option");
@@ -2102,7 +2100,7 @@ sba_init(void)
 		 * If we didn't find something sba_iommu can claim, we
 		 * need to setup the swiotlb and switch to the dig machvec.
 		 */
-		dma_ops = &swiotlb_dma_ops;
+		dma_ops = &dma_direct_ops;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to find SBA IOMMU or initialize "
 			      "software I/O TLB: Try machvec=dig boot option");
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 36dd6aa6d759..80cd3e1ea95a 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -36,7 +36,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
 
 void __init swiotlb_dma_init(void)
 {
-	dma_ops = &swiotlb_dma_ops;
+	dma_ops = &dma_direct_ops;
 	swiotlb_init(1);
 }
 #endif
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index b4c477eb46ce..69f914667f3e 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -10,8 +10,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #if defined(CONFIG_MACH_JAZZ)
 	return &jazz_dma_ops;
-#elif defined(CONFIG_SWIOTLB)
-	return &swiotlb_dma_ops;
 #else
 	return &dma_direct_ops;
 #endif
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 3d8df2cf8be9..430a7d0aa2cb 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -50,15 +50,15 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
 	.alloc = __dma_nommu_alloc_coherent,
 	.free = __dma_nommu_free_coherent,
 	.mmap = dma_nommu_mmap_coherent,
-	.map_sg = swiotlb_map_sg_attrs,
-	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.map_sg = dma_direct_map_sg,
+	.unmap_sg = dma_direct_unmap_sg,
 	.dma_supported = swiotlb_dma_supported,
-	.map_page = swiotlb_map_page,
-	.unmap_page = swiotlb_unmap_page,
-	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
-	.sync_single_for_device = swiotlb_sync_single_for_device,
-	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
-	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.map_page = dma_direct_map_page,
+	.unmap_page = dma_direct_unmap_page,
+	.sync_single_for_cpu = dma_direct_sync_single_for_cpu,
+	.sync_single_for_device = dma_direct_sync_single_for_device,
+	.sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
+	.sync_sg_for_device = dma_direct_sync_sg_for_device,
 	.get_required_mask = swiotlb_powerpc_get_required,
 };
 
diff --git a/arch/riscv/include/asm/dma-mapping.h b/arch/riscv/include/asm/dma-mapping.h
deleted file mode 100644
index 8facc1c8fa05..000000000000
--- a/arch/riscv/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef _RISCV_ASM_DMA_MAPPING_H
-#define _RISCV_ASM_DMA_MAPPING_H 1
-
-#ifdef CONFIG_SWIOTLB
-#include <linux/swiotlb.h>
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-	return &swiotlb_dma_ops;
-}
-#else
-#include <asm-generic/dma-mapping.h>
-#endif /* CONFIG_SWIOTLB */
-
-#endif /* _RISCV_ASM_DMA_MAPPING_H */
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index bd08b9e1c9e2..5f5302028a9a 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -62,10 +62,8 @@ IOMMU_INIT(pci_swiotlb_detect_4gb,
 
 void __init pci_swiotlb_init(void)
 {
-	if (swiotlb) {
+	if (swiotlb)
 		swiotlb_init(0);
-		dma_ops = &swiotlb_dma_ops;
-	}
 }
 
 void __init pci_swiotlb_late_init(void)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 006f373f54ab..385afa2b9e17 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -380,13 +380,6 @@ void __init mem_encrypt_init(void)
 	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
 	swiotlb_update_mem_attributes();
 
-	/*
-	 * With SEV, DMA operations cannot use encryption, we need to use
-	 * SWIOTLB to bounce buffer DMA operation.
-	 */
-	if (sev_active())
-		dma_ops = &swiotlb_dma_ops;
-
 	/*
 	 * With SEV, we need to unroll the rep string I/O instructions.
 	 */
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7a5bafb76d77..3cdafea55ab6 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -168,7 +168,6 @@ static void sta2x11_setup_pdev(struct pci_dev *pdev)
 		return;
 	pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
 	pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
-	pdev->dev.dma_ops = &swiotlb_dma_ops;
 	pdev->dev.archdata.is_sta2x11 = true;
 
 	/* We must enable all devices as master, for audio DMA to work */
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 1aa73f4907ae..3b0a3ea3876d 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -63,7 +63,19 @@ void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
 dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs);
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
 int dma_direct_supported(struct device *dev, u64 mask);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 14aec0b70dd9..7c007ed7505f 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -16,8 +16,6 @@ enum swiotlb_force {
 	SWIOTLB_NO_FORCE,	/* swiotlb=noforce */
 };
 
-extern enum swiotlb_force swiotlb_force;
-
 /*
  * Maximum allowable number of contiguous slabs to map,
  * must be a power of 2.  What is the appropriate value ?
@@ -62,56 +60,44 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
 				    size_t size, enum dma_data_direction dir,
 				    enum dma_sync_target target);
 
-/* Accessory functions. */
-
-extern dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction dir,
-				   unsigned long attrs);
-extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs);
-
-extern int
-swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
-		     enum dma_data_direction dir,
-		     unsigned long attrs);
-
-extern void
-swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
-		       int nelems, enum dma_data_direction dir,
-		       unsigned long attrs);
-
-extern void
-swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
-			    size_t size, enum dma_data_direction dir);
-
-extern void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-			int nelems, enum dma_data_direction dir);
-
-extern void
-swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir);
-
-extern void
-swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
-			   int nelems, enum dma_data_direction dir);
-
 extern int
 swiotlb_dma_supported(struct device *hwdev, u64 mask);
 
 #ifdef CONFIG_SWIOTLB
-extern void __init swiotlb_exit(void);
+extern enum swiotlb_force swiotlb_force;
+extern phys_addr_t io_tlb_start, io_tlb_end;
+
+static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+{
+	return paddr >= io_tlb_start && paddr < io_tlb_end;
+}
+
+bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 #else
-static inline void swiotlb_exit(void) { }
-static inline unsigned int swiotlb_max_segment(void) { return 0; }
-#endif
+#define swiotlb_force SWIOTLB_NO_FORCE
+static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+{
+	return false;
+}
+static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys,
+		dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	return false;
+}
+static inline void swiotlb_exit(void)
+{
+}
+static inline unsigned int swiotlb_max_segment(void)
+{
+	return 0;
+}
+#endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
 extern void swiotlb_set_max_segment(unsigned int);
 
-extern const struct dma_map_ops swiotlb_dma_ops;
-
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index d45306473c90..85d8286a0ba2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
 #include <linux/dma-noncoherent.h>
 #include <linux/pfn.h>
 #include <linux/set_memory.h>
+#include <linux/swiotlb.h>
 
 /*
  * Most architectures use ZONE_DMA for the first 16 Megabytes, but
@@ -209,69 +210,110 @@ void dma_direct_free(struct device *dev, size_t size,
 		dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
 }
 
-static void dma_direct_sync_single_for_device(struct device *dev,
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
 		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
-	if (dev_is_dma_coherent(dev))
-		return;
-	arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (unlikely(is_swiotlb_buffer(paddr)))
+		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_for_device(dev, paddr, size, dir);
 }
 
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
-static void dma_direct_sync_sg_for_device(struct device *dev,
+void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int i;
 
-	if (dev_is_dma_coherent(dev))
-		return;
+	for_each_sg(sgl, sg, nents, i) {
+		if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+			swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
+					dir, SYNC_FOR_DEVICE);
 
-	for_each_sg(sgl, sg, nents, i)
-		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+		if (!dev_is_dma_coherent(dev))
+			arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+					dir);
+	}
 }
 #endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
-    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-static void dma_direct_sync_single_for_cpu(struct device *dev,
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_cpu(struct device *dev,
 		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
-	if (dev_is_dma_coherent(dev))
-		return;
-	arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
-	arch_sync_dma_for_cpu_all(dev);
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (!dev_is_dma_coherent(dev)) {
+		arch_sync_dma_for_cpu(dev, paddr, size, dir);
+		arch_sync_dma_for_cpu_all(dev);
+	}
+
+	if (unlikely(is_swiotlb_buffer(paddr)))
+		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
 
-static void dma_direct_sync_sg_for_cpu(struct device *dev,
+void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int i;
 
-	if (dev_is_dma_coherent(dev))
-		return;
+	for_each_sg(sgl, sg, nents, i) {
+		if (!dev_is_dma_coherent(dev))
+			arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+	
+		if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+			swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir,
+					SYNC_FOR_CPU);
+	}
 
-	for_each_sg(sgl, sg, nents, i)
-		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
-	arch_sync_dma_for_cpu_all(dev);
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_for_cpu_all(dev);
 }
 
-static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
+	phys_addr_t phys = dma_to_phys(dev, addr);
+
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+	if (unlikely(is_swiotlb_buffer(phys)))
+		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
-static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+			     attrs);
+}
+#else
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
 {
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
 }
 #endif
 
+static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
+		size_t size)
+{
+	return swiotlb_force != SWIOTLB_FORCE &&
+		(!dev || dma_capable(dev, dma_addr, size));
+}
+
 dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs)
@@ -279,13 +321,14 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 	phys_addr_t phys = page_to_phys(page) + offset;
 	dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-	if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+	if (unlikely(!dma_direct_possible(dev, dma_addr, size)) &&
+	    !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) {
 		report_addr(dev, dma_addr, size);
 		return DMA_MAPPING_ERROR;
 	}
 
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
+	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		arch_sync_dma_for_device(dev, phys, size, dir);
 	return dma_addr;
 }
 
@@ -299,11 +342,15 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
 				sg->offset, sg->length, dir, attrs);
 		if (sg->dma_address == DMA_MAPPING_ERROR)
-			return 0;
+			goto out_unmap;
 		sg_dma_len(sg) = sg->length;
 	}
 
 	return nents;
+
+out_unmap:
+	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+	return 0;
 }
 
 /*
@@ -331,12 +378,14 @@ const struct dma_map_ops dma_direct_ops = {
 	.free			= dma_direct_free,
 	.map_page		= dma_direct_map_page,
 	.map_sg			= dma_direct_map_sg,
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
 	.sync_single_for_device	= dma_direct_sync_single_for_device,
 	.sync_sg_for_device	= dma_direct_sync_sg_for_device,
 #endif
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
-    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
 	.sync_single_for_cpu	= dma_direct_sync_single_for_cpu,
 	.sync_sg_for_cpu	= dma_direct_sync_sg_for_cpu,
 	.unmap_page		= dma_direct_unmap_page,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 2e126bac5d7d..d6361776dc5c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,7 +21,6 @@
 
 #include <linux/cache.h>
 #include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
 #include <linux/mm.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
@@ -65,7 +64,7 @@ enum swiotlb_force swiotlb_force;
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
  * API.
  */
-static phys_addr_t io_tlb_start, io_tlb_end;
+phys_addr_t io_tlb_start, io_tlb_end;
 
 /*
  * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
@@ -383,11 +382,6 @@ void __init swiotlb_exit(void)
 	max_segment = 0;
 }
 
-static int is_swiotlb_buffer(phys_addr_t paddr)
-{
-	return paddr >= io_tlb_start && paddr < io_tlb_end;
-}
-
 /*
  * Bounce: copy the swiotlb buffer back to the original dma location
  */
@@ -623,221 +617,36 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 	}
 }
 
-static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys,
+/*
+ * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing
+ * to the device copy the data into it as well.
+ */
+bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-	dma_addr_t dma_addr;
+	trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force);
 
 	if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) {
 		dev_warn_ratelimited(dev,
 			"Cannot do DMA to address %pa\n", phys);
-		return DMA_MAPPING_ERROR;
+		return false;
 	}
 
 	/* Oh well, have to allocate and map a bounce buffer. */
 	*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
 			*phys, size, dir, attrs);
 	if (*phys == DMA_MAPPING_ERROR)
-		return DMA_MAPPING_ERROR;
+		return false;
 
 	/* Ensure that the address returned is DMA'ble */
-	dma_addr = __phys_to_dma(dev, *phys);
-	if (unlikely(!dma_capable(dev, dma_addr, size))) {
+	*dma_addr = __phys_to_dma(dev, *phys);
+	if (unlikely(!dma_capable(dev, *dma_addr, size))) {
 		swiotlb_tbl_unmap_single(dev, *phys, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC);
-		return DMA_MAPPING_ERROR;
-	}
-
-	return dma_addr;
-}
-
-/*
- * Map a single buffer of the indicated size for DMA in streaming mode.  The
- * physical address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
- */
-dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
-			    unsigned long offset, size_t size,
-			    enum dma_data_direction dir,
-			    unsigned long attrs)
-{
-	phys_addr_t phys = page_to_phys(page) + offset;
-	dma_addr_t dev_addr = phys_to_dma(dev, phys);
-
-	BUG_ON(dir == DMA_NONE);
-	/*
-	 * If the address happens to be in the device's DMA window,
-	 * we can safely return the device addr and not worry about bounce
-	 * buffering it.
-	 */
-	if (!dma_capable(dev, dev_addr, size) ||
-	    swiotlb_force == SWIOTLB_FORCE) {
-		trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
-		dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs);
+		return false;
 	}
 
-	if (!dev_is_dma_coherent(dev) &&
-	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 &&
-	    dev_addr != DMA_MAPPING_ERROR)
-		arch_sync_dma_for_device(dev, phys, size, dir);
-
-	return dev_addr;
-}
-
-/*
- * Unmap a single streaming mode DMA translation.  The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_page call.  All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
-			size_t size, enum dma_data_direction dir,
-			unsigned long attrs)
-{
-	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
-	BUG_ON(dir == DMA_NONE);
-
-	if (!dev_is_dma_coherent(hwdev) &&
-	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
-		arch_sync_dma_for_cpu(hwdev, paddr, size, dir);
-
-	if (is_swiotlb_buffer(paddr))
-		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
-}
-
-/*
- * Make physical memory consistent for a single streaming mode DMA translation
- * after a transfer.
- *
- * If you perform a swiotlb_map_page() but wish to interrogate the buffer
- * using the cpu, yet do not wish to teardown the dma mapping, you must
- * call this function before doing so.  At the next point you give the dma
- * address back to the card, you must first perform a
- * swiotlb_dma_sync_for_device, and then the device again owns the buffer
- */
-static void
-swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
-		    size_t size, enum dma_data_direction dir,
-		    enum dma_sync_target target)
-{
-	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
-	BUG_ON(dir == DMA_NONE);
-
-	if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU)
-		arch_sync_dma_for_cpu(hwdev, paddr, size, dir);
-
-	if (is_swiotlb_buffer(paddr))
-		swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
-
-	if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE)
-		arch_sync_dma_for_device(hwdev, paddr, size, dir);
-}
-
-void
-swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
-			    size_t size, enum dma_data_direction dir)
-{
-	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
-}
-
-void
-swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir)
-{
-	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
-}
-
-/*
- * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_page
- * interface.  Here the scatter gather list elements are each tagged with the
- * appropriate dma address and length.  They are obtained via
- * sg_dma_{address,length}(SG).
- *
- * Device ownership issues as mentioned above for swiotlb_map_page are the
- * same here.
- */
-int
-swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems,
-		     enum dma_data_direction dir, unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sgl, sg, nelems, i) {
-		sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset,
-				sg->length, dir, attrs);
-		if (sg->dma_address == DMA_MAPPING_ERROR)
-			goto out_error;
-		sg_dma_len(sg) = sg->length;
-	}
-
-	return nelems;
-
-out_error:
-	swiotlb_unmap_sg_attrs(dev, sgl, i, dir,
-			attrs | DMA_ATTR_SKIP_CPU_SYNC);
-	sg_dma_len(sgl) = 0;
-	return 0;
-}
-
-/*
- * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_page() above.
- */
-void
-swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
-		       int nelems, enum dma_data_direction dir,
-		       unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	BUG_ON(dir == DMA_NONE);
-
-	for_each_sg(sgl, sg, nelems, i)
-		swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir,
-			     attrs);
-}
-
-/*
- * Make physical memory consistent for a set of streaming mode DMA translations
- * after a transfer.
- *
- * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
- * and usage.
- */
-static void
-swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
-		int nelems, enum dma_data_direction dir,
-		enum dma_sync_target target)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sgl, sg, nelems, i)
-		swiotlb_sync_single(hwdev, sg->dma_address,
-				    sg_dma_len(sg), dir, target);
-}
-
-void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-			int nelems, enum dma_data_direction dir)
-{
-	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
-}
-
-void
-swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
-			   int nelems, enum dma_data_direction dir)
-{
-	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+	return true;
 }
 
 /*
@@ -851,18 +660,3 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
 	return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
-
-const struct dma_map_ops swiotlb_dma_ops = {
-	.alloc			= dma_direct_alloc,
-	.free			= dma_direct_free,
-	.sync_single_for_cpu	= swiotlb_sync_single_for_cpu,
-	.sync_single_for_device	= swiotlb_sync_single_for_device,
-	.sync_sg_for_cpu	= swiotlb_sync_sg_for_cpu,
-	.sync_sg_for_device	= swiotlb_sync_sg_for_device,
-	.map_sg			= swiotlb_map_sg_attrs,
-	.unmap_sg		= swiotlb_unmap_sg_attrs,
-	.map_page		= swiotlb_map_page,
-	.unmap_page		= swiotlb_unmap_page,
-	.dma_supported		= dma_direct_supported,
-};
-EXPORT_SYMBOL(swiotlb_dma_ops);
-- 
cgit 


From 356da6d0cde3323236977fce54c1f9612a742036 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 13:39:32 -0800
Subject: dma-mapping: bypass indirect calls for dma-direct

Avoid expensive indirect calls in the fast path DMA mapping
operations by directly calling the dma_direct_* ops if we are using
the directly mapped DMA operations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
---
 arch/alpha/include/asm/dma-mapping.h |   2 +-
 arch/arc/mm/cache.c                  |   2 +-
 arch/arm/include/asm/dma-mapping.h   |   2 +-
 arch/arm/mm/dma-mapping-nommu.c      |  14 +----
 arch/arm64/mm/dma-mapping.c          |   3 -
 arch/ia64/hp/common/hwsw_iommu.c     |   2 +-
 arch/ia64/hp/common/sba_iommu.c      |   4 +-
 arch/ia64/kernel/dma-mapping.c       |   1 -
 arch/mips/include/asm/dma-mapping.h  |   2 +-
 arch/parisc/kernel/setup.c           |   4 --
 arch/sparc/include/asm/dma-mapping.h |   4 +-
 arch/x86/kernel/pci-dma.c            |   2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c  |   2 +-
 drivers/iommu/amd_iommu.c            |  13 +---
 include/asm-generic/dma-mapping.h    |   2 +-
 include/linux/dma-direct.h           |  17 ------
 include/linux/dma-mapping.h          | 111 ++++++++++++++++++++++++++++++-----
 include/linux/dma-noncoherent.h      |   5 +-
 kernel/dma/direct.c                  |  37 +++---------
 kernel/dma/mapping.c                 |  40 ++++++++-----
 20 files changed, 150 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index 8beeafd4f68e..0ee6a5c99b16 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -7,7 +7,7 @@ extern const struct dma_map_ops alpha_pci_ops;
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_ALPHA_JENSEN
-	return &dma_direct_ops;
+	return NULL;
 #else
 	return &alpha_pci_ops;
 #endif
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index f2701c13a66b..e188bb3ede53 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -1280,7 +1280,7 @@ void __init arc_cache_init_master(void)
 	/*
 	 * In case of IOC (say IOC+SLC case), pointers above could still be set
 	 * but end up not being relevant as the first function in chain is not
-	 * called at all for @dma_direct_ops
+	 * called at all for devices using coherent DMA.
 	 *     arch_sync_dma_for_cpu() -> dma_cache_*() -> __dma_cache_*()
 	 */
 }
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 965b7c846ecb..31d3b96f0f4b 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -18,7 +18,7 @@ extern const struct dma_map_ops arm_coherent_dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_direct_ops;
+	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : NULL;
 }
 
 #ifdef __arch_page_to_dma
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 712416ecd8e6..f304b10e23a4 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -22,7 +22,7 @@
 #include "dma.h"
 
 /*
- *  dma_direct_ops is used if
+ *  The generic direct mapping code is used if
  *   - MMU/MPU is off
  *   - cpu is v7m w/o cache support
  *   - device is coherent
@@ -209,16 +209,9 @@ const struct dma_map_ops arm_nommu_dma_ops = {
 };
 EXPORT_SYMBOL(arm_nommu_dma_ops);
 
-static const struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
-{
-	return coherent ? &dma_direct_ops : &arm_nommu_dma_ops;
-}
-
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	const struct dma_map_ops *dma_ops;
-
 	if (IS_ENABLED(CONFIG_CPU_V7M)) {
 		/*
 		 * Cache support for v7m is optional, so can be treated as
@@ -234,7 +227,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
 	}
 
-	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
-
-	set_dma_ops(dev, dma_ops);
+	if (!dev->archdata.dma_coherent)
+		set_dma_ops(dev, &arm_nommu_dma_ops);
 }
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index ab1e417204d0..95eda81e3f2d 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -462,9 +462,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	if (!dev->dma_ops)
-		dev->dma_ops = &dma_direct_ops;
-
 	dev->dma_coherent = coherent;
 	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
 
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index f40ca499b246..8840ed97712f 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev)
 const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
 {
 	if (use_swiotlb(dev))
-		return &dma_direct_ops;
+		return NULL;
 	return &sba_dma_ops;
 }
 EXPORT_SYMBOL(hwsw_dma_get_ops);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 5ee74820a0f6..5a361e51cb1e 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2078,7 +2078,7 @@ sba_init(void)
 	 * a successful kdump kernel boot is to use the swiotlb.
 	 */
 	if (is_kdump_kernel()) {
-		dma_ops = &dma_direct_ops;
+		dma_ops = NULL;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to initialize software I/O TLB:"
 				  " Try machvec=dig boot option");
@@ -2100,7 +2100,7 @@ sba_init(void)
 		 * If we didn't find something sba_iommu can claim, we
 		 * need to setup the swiotlb and switch to the dig machvec.
 		 */
-		dma_ops = &dma_direct_ops;
+		dma_ops = NULL;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to find SBA IOMMU or initialize "
 			      "software I/O TLB: Try machvec=dig boot option");
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 80cd3e1ea95a..ad7d9963de34 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -36,7 +36,6 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
 
 void __init swiotlb_dma_init(void)
 {
-	dma_ops = &dma_direct_ops;
 	swiotlb_init(1);
 }
 #endif
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 69f914667f3e..20dfaad3a55d 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -11,7 +11,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 #if defined(CONFIG_MACH_JAZZ)
 	return &jazz_dma_ops;
 #else
-	return &dma_direct_ops;
+	return NULL;
 #endif
 }
 
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index cd227f1cf629..54818cd78bd0 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -99,10 +99,6 @@ void __init dma_ops_init(void)
 
 	case pcxl2:
 		pa7300lc_init();
-	case pcxl: /* falls through */
-	case pcxs:
-	case pcxt:
-		hppa_dma_ops = &dma_direct_ops;
 		break;
 	default:
 		break;
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 55a44f08a9a4..ed32845bd2d2 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -12,11 +12,11 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_SPARC_LEON
 	if (sparc_cpu_model == sparc_leon)
-		return &dma_direct_ops;
+		return NULL;
 #endif
 #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
 	if (bus == &pci_bus_type)
-		return &dma_direct_ops;
+		return NULL;
 #endif
 	return dma_ops;
 }
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f4562fcec681..d460998ae828 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -17,7 +17,7 @@
 
 static bool disable_dac_quirk __read_mostly;
 
-const struct dma_map_ops *dma_ops = &dma_direct_ops;
+const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 #ifdef CONFIG_IOMMU_DEBUG
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 61a84b958d67..50637f372e9f 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -581,7 +581,7 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv)
 
 	dev_priv->map_mode = vmw_dma_map_populate;
 
-	if (dma_ops->sync_single_for_cpu)
+	if (dma_ops && dma_ops->sync_single_for_cpu)
 		dev_priv->map_mode = vmw_dma_alloc_coherent;
 #ifdef CONFIG_SWIOTLB
 	if (swiotlb_nr_tbl() == 0)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c5d6c7c42b0a..567221cca13c 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2184,7 +2184,7 @@ static int amd_iommu_add_device(struct device *dev)
 				dev_name(dev));
 
 		iommu_ignore_device(dev);
-		dev->dma_ops = &dma_direct_ops;
+		dev->dma_ops = NULL;
 		goto out;
 	}
 	init_iommu_group(dev);
@@ -2770,17 +2770,6 @@ int __init amd_iommu_init_dma_ops(void)
 	swiotlb        = (iommu_pass_through || sme_me_mask) ? 1 : 0;
 	iommu_detected = 1;
 
-	/*
-	 * In case we don't initialize SWIOTLB (actually the common case
-	 * when AMD IOMMU is enabled and SME is not active), make sure there
-	 * are global dma_ops set as a fall-back for devices not handled by
-	 * this driver (for example non-PCI devices). When SME is active,
-	 * make sure that swiotlb variable remains set so the global dma_ops
-	 * continue to be SWIOTLB.
-	 */
-	if (!swiotlb)
-		dma_ops = &dma_direct_ops;
-
 	if (amd_iommu_unmap_flush)
 		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
 	else
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 880a292d792f..c13f46109e88 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -4,7 +4,7 @@
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	return &dma_direct_ops;
+	return NULL;
 }
 
 #endif /* _ASM_GENERIC_DMA_MAPPING_H */
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 3b0a3ea3876d..b7338702592a 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -60,22 +60,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
-		enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
 int dma_direct_supported(struct device *dev, u64 mask);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 269ee27fc3d9..f422aec0f53c 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -134,7 +134,6 @@ struct dma_map_ops {
 
 #define DMA_MAPPING_ERROR		(~(dma_addr_t)0)
 
-extern const struct dma_map_ops dma_direct_ops;
 extern const struct dma_map_ops dma_virt_ops;
 extern const struct dma_map_ops dma_dummy_ops;
 
@@ -222,6 +221,69 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 }
 #endif
 
+static inline bool dma_is_direct(const struct dma_map_ops *ops)
+{
+	return likely(!ops);
+}
+
+/*
+ * All the dma_direct_* declarations are here just for the indirect call bypass,
+ * and must not be used directly drivers!
+ */
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+}
+static inline void dma_direct_unmap_sg(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      size_t size,
 					      enum dma_data_direction dir,
@@ -232,9 +294,12 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_map_single(dev, ptr, size);
-	addr = ops->map_page(dev, virt_to_page(ptr),
-			     offset_in_page(ptr), size,
-			     dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
+	else
+		addr = ops->map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
 	debug_dma_map_page(dev, virt_to_page(ptr),
 			   offset_in_page(ptr), size,
 			   dir, addr, true);
@@ -249,7 +314,9 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
 	debug_dma_unmap_page(dev, addr, size, dir, true);
 }
@@ -272,7 +339,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 	int ents;
 
 	BUG_ON(!valid_dma_direction(dir));
-	ents = ops->map_sg(dev, sg, nents, dir, attrs);
+	if (dma_is_direct(ops))
+		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+	else
+		ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
 	debug_dma_map_sg(dev, sg, nents, ents, dir);
 
@@ -287,7 +357,9 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (ops->unmap_sg)
+	if (dma_is_direct(ops))
+		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+	else if (ops->unmap_sg)
 		ops->unmap_sg(dev, sg, nents, dir, attrs);
 }
 
@@ -301,7 +373,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
-	addr = ops->map_page(dev, page, offset, size, dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+	else
+		addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
 	return addr;
@@ -322,7 +397,7 @@ static inline dma_addr_t dma_map_resource(struct device *dev,
 	BUG_ON(pfn_valid(PHYS_PFN(phys_addr)));
 
 	addr = phys_addr;
-	if (ops->map_resource)
+	if (ops && ops->map_resource)
 		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
 
 	debug_dma_map_resource(dev, phys_addr, size, dir, addr);
@@ -337,7 +412,7 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_resource)
+	if (ops && ops->unmap_resource)
 		ops->unmap_resource(dev, addr, size, dir, attrs);
 	debug_dma_unmap_resource(dev, addr, size, dir);
 }
@@ -349,7 +424,9 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu)
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+	else if (ops->sync_single_for_cpu)
 		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
@@ -368,7 +445,9 @@ static inline void dma_sync_single_for_device(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device)
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_device(dev, addr, size, dir);
+	else if (ops->sync_single_for_device)
 		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
@@ -387,7 +466,9 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_cpu)
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_cpu)
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
@@ -399,7 +480,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_device)
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_device)
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 306557331d7d..69b36ed31a99 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -38,7 +38,10 @@ pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot,
 void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction direction);
 #else
-#define arch_dma_cache_sync NULL
+static inline void arch_dma_cache_sync(struct device *dev, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+}
 #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 85d8286a0ba2..79da61b49fa4 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -223,6 +223,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(dev, paddr, size, dir);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_device);
 
 void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -240,6 +241,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir);
 	}
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
 #endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
@@ -258,6 +260,7 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
 	if (unlikely(is_swiotlb_buffer(paddr)))
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
 
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -277,6 +280,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu_all(dev);
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
 
 void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -289,6 +293,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 	if (unlikely(is_swiotlb_buffer(phys)))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
+EXPORT_SYMBOL(dma_direct_unmap_page);
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -300,11 +305,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
-#else
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-}
+EXPORT_SYMBOL(dma_direct_unmap_sg);
 #endif
 
 static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
@@ -331,6 +332,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		arch_sync_dma_for_device(dev, phys, size, dir);
 	return dma_addr;
 }
+EXPORT_SYMBOL(dma_direct_map_page);
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
@@ -352,6 +354,7 @@ out_unmap:
 	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
 	return 0;
 }
+EXPORT_SYMBOL(dma_direct_map_sg);
 
 /*
  * Because 32-bit DMA masks are so common we expect every architecture to be
@@ -372,27 +375,3 @@ int dma_direct_supported(struct device *dev, u64 mask)
 
 	return mask >= phys_to_dma(dev, min_mask);
 }
-
-const struct dma_map_ops dma_direct_ops = {
-	.alloc			= dma_direct_alloc,
-	.free			= dma_direct_free,
-	.map_page		= dma_direct_map_page,
-	.map_sg			= dma_direct_map_sg,
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
-    defined(CONFIG_SWIOTLB)
-	.sync_single_for_device	= dma_direct_sync_single_for_device,
-	.sync_sg_for_device	= dma_direct_sync_sg_for_device,
-#endif
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
-    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
-    defined(CONFIG_SWIOTLB)
-	.sync_single_for_cpu	= dma_direct_sync_single_for_cpu,
-	.sync_sg_for_cpu	= dma_direct_sync_sg_for_cpu,
-	.unmap_page		= dma_direct_unmap_page,
-	.unmap_sg		= dma_direct_unmap_sg,
-#endif
-	.get_required_mask	= dma_direct_get_required_mask,
-	.dma_supported		= dma_direct_supported,
-	.cache_sync		= arch_dma_cache_sync,
-};
-EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 0b18cfbdde95..fc84c81029d9 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -7,6 +7,7 @@
  */
 #include <linux/memblock.h> /* for max_pfn */
 #include <linux/acpi.h>
+#include <linux/dma-direct.h>
 #include <linux/dma-noncoherent.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
@@ -229,8 +230,8 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
 		unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->get_sgtable)
+
+	if (!dma_is_direct(ops) && ops->get_sgtable)
 		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
 					attrs);
 	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
@@ -293,8 +294,8 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 		unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->mmap)
+
+	if (!dma_is_direct(ops) && ops->mmap)
 		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
 	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
 }
@@ -324,6 +325,8 @@ u64 dma_get_required_mask(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
+	if (dma_is_direct(ops))
+		return dma_direct_get_required_mask(dev);
 	if (ops->get_required_mask)
 		return ops->get_required_mask(dev);
 	return dma_default_get_required_mask(dev);
@@ -341,7 +344,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	void *cpu_addr;
 
-	BUG_ON(!ops);
 	WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
 
 	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
@@ -352,10 +354,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 	if (!arch_dma_alloc_attrs(&dev))
 		return NULL;
-	if (!ops->alloc)
+
+	if (dma_is_direct(ops))
+		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+	else if (ops->alloc)
+		cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+	else
 		return NULL;
 
-	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
 	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
 	return cpu_addr;
 }
@@ -366,8 +372,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	BUG_ON(!ops);
-
 	if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
 		return;
 	/*
@@ -379,11 +383,14 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 	 */
 	WARN_ON(irqs_disabled());
 
-	if (!ops->free || !cpu_addr)
+	if (!cpu_addr)
 		return;
 
 	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
+	if (dma_is_direct(ops))
+		dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+	else if (ops->free)
+		ops->free(dev, size, cpu_addr, dma_handle, attrs);
 }
 EXPORT_SYMBOL(dma_free_attrs);
 
@@ -397,9 +404,9 @@ int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (!ops)
-		return 0;
-	if (!ops->dma_supported)
+	if (dma_is_direct(ops))
+		return dma_direct_supported(dev, mask);
+	if (ops->dma_supported)
 		return 1;
 	return ops->dma_supported(dev, mask);
 }
@@ -437,7 +444,10 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->cache_sync)
+
+	if (dma_is_direct(ops))
+		arch_dma_cache_sync(dev, vaddr, size, dir);
+	else if (ops->cache_sync)
 		ops->cache_sync(dev, vaddr, size, dir);
 }
 EXPORT_SYMBOL(dma_cache_sync);
-- 
cgit 


From 9f8c1c5712954f9d8877ac55b18adbdf03e51e1f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 12 Dec 2018 10:45:38 +0100
Subject: bpf: remove obsolete prog->aux sanitation in bpf_insn_prepare_dump

This logic is not needed anymore since we got rid of the verifier
rewrite that was using prog->aux address in f6069b9aa993 ("bpf:
fix redirect to map under tail calls").

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7f1410d6fbe9..6ae062f1cf20 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2032,13 +2032,6 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
 			insns[i + 1].imm = 0;
 			continue;
 		}
-
-		if (!bpf_dump_raw_ok() &&
-		    imm == (unsigned long)prog->aux) {
-			insns[i].imm = 0;
-			insns[i + 1].imm = 0;
-			continue;
-		}
 	}
 
 	return insns;
-- 
cgit 


From 319deec7db6c0aab276d2447f778e7cffed24c7c Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Wed, 12 Dec 2018 19:46:54 -0700
Subject: seccomp: fix poor type promotion

sparse complains,

kernel/seccomp.c:1172:13: warning: incorrect type in assignment (different base types)
kernel/seccomp.c:1172:13:    expected restricted __poll_t [usertype] ret
kernel/seccomp.c:1172:13:    got int
kernel/seccomp.c:1173:13: warning: restricted __poll_t degrades to integer

Instead of assigning this to ret, since we don't use this anywhere, let's
just test it against 0 directly.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
Reported-by: 0day robot <lkp@intel.com>
Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15b6be97fc09..d7f538847b84 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1169,8 +1169,7 @@ static __poll_t seccomp_notify_poll(struct file *file,
 
 	poll_wait(file, &filter->notif->wqh, poll_tab);
 
-	ret = mutex_lock_interruptible(&filter->notify_lock);
-	if (ret < 0)
+	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
 		return EPOLLERR;
 
 	list_for_each_entry(cur, &filter->notif->notifications, list) {
-- 
cgit 


From d406db524c32ca35bd85cada28a547fff3115715 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sun, 9 Dec 2018 14:25:02 +0800
Subject: audit: remove duplicated include from audit.c

Remove duplicated include.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index a0a4544e69ca..632d36059556 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,7 +60,6 @@
 #include <linux/mutex.h>
 #include <linux/gfp.h>
 #include <linux/pid.h>
-#include <linux/slab.h>
 
 #include <linux/audit.h>
 
-- 
cgit 


From 5439c985c5a83a8419f762115afdf560ab72a452 Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Fri, 14 Dec 2018 17:05:54 +0100
Subject: module: Overwrite st_size instead of st_info

st_info is currently overwritten after relocation and used to store the
elf_type().  However, we're going to need it fix kallsyms on ARM's
Thumb-2 kernels, so preserve st_info and overwrite the st_size field
instead.  st_size is neither used by the module core nor by any
architecture.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1b5edf78694c..b36ff8a3d562 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2684,7 +2684,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 
 	/* Set types up while we still have access to sections. */
 	for (i = 0; i < mod->kallsyms->num_symtab; i++)
-		mod->kallsyms->symtab[i].st_info
+		mod->kallsyms->symtab[i].st_size
 			= elf_type(&mod->kallsyms->symtab[i], info);
 
 	/* Now populate the cut down core kallsyms for after init. */
@@ -4070,7 +4070,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		kallsyms = rcu_dereference_sched(mod->kallsyms);
 		if (symnum < kallsyms->num_symtab) {
 			*value = kallsyms->symtab[symnum].st_value;
-			*type = kallsyms->symtab[symnum].st_info;
+			*type = kallsyms->symtab[symnum].st_size;
 			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, *value, mod);
-- 
cgit 


From 93d77e7f1410c366050d6035dcba1a5167c7cf0b Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Fri, 14 Dec 2018 17:05:55 +0100
Subject: ARM: module: Fix function kallsyms on Thumb-2

Thumb-2 functions have the lowest bit set in the symbol value in the
symtab.  When kallsyms are generated for the vmlinux, the kallsyms are
generated from the output of nm, and nm clears the lowest bit.

 $ arm-linux-gnueabihf-readelf -a vmlinux | grep show_interrupts
  95947: 8015dc89   686 FUNC    GLOBAL DEFAULT    2 show_interrupts
 $ arm-linux-gnueabihf-nm vmlinux | grep show_interrupts
 8015dc88 T show_interrupts
 $ cat /proc/kallsyms | grep show_interrupts
 8015dc88 T show_interrupts

However, for modules, the kallsyms uses the values in the symbol table
without modification, so for functions in modules, the lowest bit is set
in kallsyms.

 $ arm-linux-gnueabihf-readelf -a drivers/net/tun.ko | grep tun_get_socket
    333: 00002d4d    36 FUNC    GLOBAL DEFAULT    1 tun_get_socket
 $ arm-linux-gnueabihf-nm drivers/net/tun.ko | grep tun_get_socket
 00002d4c T tun_get_socket
 $ cat /proc/kallsyms | grep tun_get_socket
 7f802d4d t tun_get_socket      [tun]

Because of this, the symbol+offset of the crashing instruction shown in
oopses is incorrect when the crash is in a module.  For example, given a
tun_get_socket which starts like this,

 00002d4c <tun_get_socket>:
     2d4c:       6943            ldr     r3, [r0, #20]
     2d4e:       4a07            ldr     r2, [pc, #28]
     2d50:       4293            cmp     r3, r2

a crash when tun_get_socket is called with NULL results in:

 PC is at tun_xdp+0xa3/0xa4 [tun]
 pc : [<7f802d4c>]

As can be seen, the "PC is at" line reports the wrong symbol name, and
the symbol+offset will point to the wrong source line if it is passed to
gdb.

To solve this, add a way for archs to fixup the reading of these module
kallsyms values, and use that to clear the lowest bit for function
symbols on Thumb-2.

After the fix:

 # cat /proc/kallsyms | grep tun_get_socket
 7f802d4c t tun_get_socket       [tun]

 PC is at tun_get_socket+0x0/0x24 [tun]
 pc : [<7f802d4c>]

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 arch/arm/include/asm/module.h | 11 +++++++++++
 include/linux/module.h        |  7 +++++++
 kernel/module.c               | 43 +++++++++++++++++++++++++++----------------
 3 files changed, 45 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h
index 9e81b7c498d8..182163b55546 100644
--- a/arch/arm/include/asm/module.h
+++ b/arch/arm/include/asm/module.h
@@ -61,4 +61,15 @@ u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val);
 	MODULE_ARCH_VERMAGIC_ARMTHUMB \
 	MODULE_ARCH_VERMAGIC_P2V
 
+#ifdef CONFIG_THUMB2_KERNEL
+#define HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
+static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
+{
+	if (ELF_ST_TYPE(sym->st_info) == STT_FUNC)
+		return sym->st_value & ~1;
+
+	return sym->st_value;
+}
+#endif
+
 #endif /* _ASM_ARM_MODULE_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index fce6b4335e36..c0b4b7840b57 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -486,6 +486,13 @@ struct module {
 #define MODULE_ARCH_INIT {}
 #endif
 
+#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
+static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
+{
+	return sym->st_value;
+}
+#endif
+
 extern struct mutex module_mutex;
 
 /* FIXME: It'd be nice to isolate modules during init, too, so they
diff --git a/kernel/module.c b/kernel/module.c
index b36ff8a3d562..164bf201eae4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3928,7 +3928,7 @@ static const char *find_kallsyms_symbol(struct module *mod,
 					unsigned long *offset)
 {
 	unsigned int i, best = 0;
-	unsigned long nextval;
+	unsigned long nextval, bestval;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
 	/* At worse, next value is at end of module */
@@ -3937,10 +3937,15 @@ static const char *find_kallsyms_symbol(struct module *mod,
 	else
 		nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
 
+	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
 	/* Scan for closest preceding symbol, and next symbol. (ELF
 	   starts real symbols at 1). */
 	for (i = 1; i < kallsyms->num_symtab; i++) {
-		if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+		unsigned long thisval = kallsyms_symbol_value(sym);
+
+		if (sym->st_shndx == SHN_UNDEF)
 			continue;
 
 		/* We ignore unnamed symbols: they're uninformative
@@ -3949,21 +3954,21 @@ static const char *find_kallsyms_symbol(struct module *mod,
 		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
 			continue;
 
-		if (kallsyms->symtab[i].st_value <= addr
-		    && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
+		if (thisval <= addr && thisval > bestval) {
 			best = i;
-		if (kallsyms->symtab[i].st_value > addr
-		    && kallsyms->symtab[i].st_value < nextval)
-			nextval = kallsyms->symtab[i].st_value;
+			bestval = thisval;
+		}
+		if (thisval > addr && thisval < nextval)
+			nextval = thisval;
 	}
 
 	if (!best)
 		return NULL;
 
 	if (size)
-		*size = nextval - kallsyms->symtab[best].st_value;
+		*size = nextval - bestval;
 	if (offset)
-		*offset = addr - kallsyms->symtab[best].st_value;
+		*offset = addr - bestval;
 
 	return kallsyms_symbol_name(kallsyms, best);
 }
@@ -4069,8 +4074,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 			continue;
 		kallsyms = rcu_dereference_sched(mod->kallsyms);
 		if (symnum < kallsyms->num_symtab) {
-			*value = kallsyms->symtab[symnum].st_value;
-			*type = kallsyms->symtab[symnum].st_size;
+			const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+			*value = kallsyms_symbol_value(sym);
+			*type = sym->st_size;
 			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, *value, mod);
@@ -4089,10 +4096,13 @@ static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *
 	unsigned int i;
 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
-	for (i = 0; i < kallsyms->num_symtab; i++)
+	for (i = 0; i < kallsyms->num_symtab; i++) {
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+
 		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
-		    kallsyms->symtab[i].st_shndx != SHN_UNDEF)
-			return kallsyms->symtab[i].st_value;
+		    sym->st_shndx != SHN_UNDEF)
+			return kallsyms_symbol_value(sym);
+	}
 	return 0;
 }
 
@@ -4137,12 +4147,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
 		for (i = 0; i < kallsyms->num_symtab; i++) {
+			const Elf_Sym *sym = &kallsyms->symtab[i];
 
-			if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+			if (sym->st_shndx == SHN_UNDEF)
 				continue;
 
 			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
-				 mod, kallsyms->symtab[i].st_value);
+				 mod, kallsyms_symbol_value(sym));
 			if (ret != 0)
 				return ret;
 		}
-- 
cgit 


From 23127b33ec80e656921362d7dc82a0064bac20a2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 13 Dec 2018 10:41:46 -0800
Subject: bpf: Create a new btf_name_by_offset() for non type name use case

The current btf_name_by_offset() is returning "(anon)" type name for
the offset == 0 case and "(invalid-name-offset)" for the out-of-bound
offset case.

It fits well for the internal BTF verbose log purpose which
is focusing on type.  For example,
offset == 0 => "(anon)" => anonymous type/name.
Returning non-NULL for the bad offset case is needed
during the BTF verification process because the BTF verifier may
complain about another field first before discovering the name_off
is invalid.

However, it may not be ideal for the newer use case which does not
necessary mean type name.  For example, when logging line_info
in the BPF verifier in the next patch, it is better to log an
empty src line instead of logging "(anon)".

The existing bpf_name_by_offset() is renamed to __bpf_name_by_offset()
and static to btf.c.

A new bpf_name_by_offset() is added for generic context usage.  It
returns "\0" for name_off == 0 (note that btf->strings[0] is "\0")
and NULL for invalid offset.  It allows the caller to decide
what is the best output in its context.

The new btf_name_by_offset() is overlapped with btf_name_offset_valid().
Hence, btf_name_offset_valid() is removed from btf.h to keep the btf.h API
minimal.  The existing btf_name_offset_valid() usage in btf.c could also be
replaced later.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   |  1 -
 kernel/bpf/btf.c      | 31 ++++++++++++++++++++-----------
 kernel/bpf/verifier.c |  4 ++--
 3 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a4cf075b89eb..58000d7e06e3 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -46,7 +46,6 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
-bool btf_name_offset_valid(const struct btf *btf, u32 offset);
 bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size);
 
 #ifdef CONFIG_BPF_SYSCALL
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1545ddfb6fa5..8fa0bf1c33fd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
 	return !*src;
 }
 
-const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!offset)
 		return "(anon)";
@@ -484,6 +484,14 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 		return "(invalid-name-offset)";
 }
 
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+	if (offset < btf->hdr.str_len)
+		return &btf->strings[offset];
+
+	return NULL;
+}
+
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 {
 	if (type_id > btf->nr_types)
@@ -576,7 +584,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "[%u] %s %s%s",
 			   env->log_type_id,
 			   btf_kind_str[kind],
-			   btf_name_by_offset(btf, t->name_off),
+			   __btf_name_by_offset(btf, t->name_off),
 			   log_details ? " " : "");
 
 	if (log_details)
@@ -620,7 +628,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 		btf_verifier_log_type(env, struct_type, NULL);
 
 	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-			   btf_name_by_offset(btf, member->name_off),
+			   __btf_name_by_offset(btf, member->name_off),
 			   member->type, member->offset);
 
 	if (fmt && *fmt) {
@@ -1872,7 +1880,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 
 
 		btf_verifier_log(env, "\t%s val=%d\n",
-				 btf_name_by_offset(btf, enums[i].name_off),
+				 __btf_name_by_offset(btf, enums[i].name_off),
 				 enums[i].val);
 	}
 
@@ -1896,7 +1904,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
 	for (i = 0; i < nr_enums; i++) {
 		if (v == enums[i].val) {
 			seq_printf(m, "%s",
-				   btf_name_by_offset(btf, enums[i].name_off));
+				   __btf_name_by_offset(btf,
+							enums[i].name_off));
 			return;
 		}
 	}
@@ -1954,20 +1963,20 @@ static void btf_func_proto_log(struct btf_verifier_env *env,
 	}
 
 	btf_verifier_log(env, "%u %s", args[0].type,
-			 btf_name_by_offset(env->btf,
-					    args[0].name_off));
+			 __btf_name_by_offset(env->btf,
+					      args[0].name_off));
 	for (i = 1; i < nr_args - 1; i++)
 		btf_verifier_log(env, ", %u %s", args[i].type,
-				 btf_name_by_offset(env->btf,
-						    args[i].name_off));
+				 __btf_name_by_offset(env->btf,
+						      args[i].name_off));
 
 	if (nr_args > 1) {
 		const struct btf_param *last_arg = &args[nr_args - 1];
 
 		if (last_arg->type)
 			btf_verifier_log(env, ", %u %s", last_arg->type,
-					 btf_name_by_offset(env->btf,
-							    last_arg->name_off));
+					 __btf_name_by_offset(env->btf,
+							      last_arg->name_off));
 		else
 			btf_verifier_log(env, ", vararg");
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8b511a4fe84a..89ce2613fdb0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4910,8 +4910,8 @@ static int check_btf_line(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
-		if (!btf_name_offset_valid(btf, linfo[i].line_off) ||
-		    !btf_name_offset_valid(btf, linfo[i].file_name_off)) {
+		if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+		    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
 			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
 			err = -EINVAL;
 			goto err_free;
-- 
cgit 


From d9762e84ede3eae9636f5dbbe0c8f0390d37e114 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 13 Dec 2018 10:41:48 -0800
Subject: bpf: verbose log bpf_line_info in verifier

This patch adds bpf_line_info during the verifier's verbose.
It can give error context for debug purpose.

~~~~~~~~~~
Here is the verbose log for backedge:
	while (a) {
		a += bpf_get_smp_processor_id();
		bpf_trace_printk(fmt, sizeof(fmt), a);
	}

~> bpftool prog load ./test_loop.o /sys/fs/bpf/test_loop type tracepoint
13: while (a) {
3: a += bpf_get_smp_processor_id();
back-edge from insn 13 to 3

~~~~~~~~~~
Here is the verbose log for invalid pkt access:
Modification to test_xdp_noinline.c:

	data = (void *)(long)xdp->data;
	data_end = (void *)(long)xdp->data_end;
/*
	if (data + 4 > data_end)
		return XDP_DROP;
*/
	*(u32 *)data = dst->dst;

~> bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp
; data = (void *)(long)xdp->data;
224: (79) r2 = *(u64 *)(r10 -112)
225: (61) r2 = *(u32 *)(r2 +0)
; *(u32 *)data = dst->dst;
226: (63) *(u32 *)(r2 +0) = r1
invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0)
R2 offset is outside of the packet

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 74 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c736945be7c5..548dcbdb7111 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -224,6 +224,7 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
 	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 89ce2613fdb0..ba8e3134bbc2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 #include <linux/perf_event.h>
+#include <linux/ctype.h>
 
 #include "disasm.h"
 
@@ -216,6 +217,27 @@ struct bpf_call_arg_meta {
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+	const struct bpf_line_info *linfo;
+	const struct bpf_prog *prog;
+	u32 i, nr_linfo;
+
+	prog = env->prog;
+	nr_linfo = prog->aux->nr_linfo;
+
+	if (!nr_linfo || insn_off >= prog->len)
+		return NULL;
+
+	linfo = prog->aux->linfo;
+	for (i = 1; i < nr_linfo; i++)
+		if (insn_off < linfo[i].insn_off)
+			break;
+
+	return &linfo[i - 1];
+}
+
 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		       va_list args)
 {
@@ -266,6 +288,42 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
+static const char *ltrim(const char *s)
+{
+	while (isspace(*s))
+		s++;
+
+	return s;
+}
+
+__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
+					 u32 insn_off,
+					 const char *prefix_fmt, ...)
+{
+	const struct bpf_line_info *linfo;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	linfo = find_linfo(env, insn_off);
+	if (!linfo || linfo == env->prev_linfo)
+		return;
+
+	if (prefix_fmt) {
+		va_list args;
+
+		va_start(args, prefix_fmt);
+		bpf_verifier_vlog(&env->log, prefix_fmt, args);
+		va_end(args);
+	}
+
+	verbose(env, "%s\n",
+		ltrim(btf_name_by_offset(env->prog->aux->btf,
+					 linfo->line_off)));
+
+	env->prev_linfo = linfo;
+}
+
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_PACKET ||
@@ -4561,6 +4619,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		return 0;
 
 	if (w < 0 || w >= env->prog->len) {
+		verbose_linfo(env, t, "%d: ", t);
 		verbose(env, "jump out of range from insn %d to %d\n", t, w);
 		return -EINVAL;
 	}
@@ -4578,6 +4637,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+		verbose_linfo(env, t, "%d: ", t);
+		verbose_linfo(env, w, "%d: ", w);
 		verbose(env, "back-edge from insn %d to %d\n", t, w);
 		return -EINVAL;
 	} else if (insn_state[w] == EXPLORED) {
@@ -4600,10 +4661,6 @@ static int check_cfg(struct bpf_verifier_env *env)
 	int ret = 0;
 	int i, t;
 
-	ret = check_subprogs(env);
-	if (ret < 0)
-		return ret;
-
 	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
 	if (!insn_state)
 		return -ENOMEM;
@@ -5448,6 +5505,8 @@ static int do_check(struct bpf_verifier_env *env)
 	int insn_processed = 0;
 	bool do_print_state = false;
 
+	env->prev_linfo = NULL;
+
 	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
 	if (!state)
 		return -ENOMEM;
@@ -5521,6 +5580,7 @@ static int do_check(struct bpf_verifier_env *env)
 				.private_data	= env,
 			};
 
+			verbose_linfo(env, insn_idx, "; ");
 			verbose(env, "%d: ", insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
 		}
@@ -6755,7 +6815,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
-	ret = check_cfg(env);
+	ret = check_subprogs(env);
 	if (ret < 0)
 		goto skip_full_check;
 
@@ -6763,6 +6823,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (ret < 0)
 		goto skip_full_check;
 
+	ret = check_cfg(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = do_check(env);
 	if (env->cur_state) {
 		free_verifier_state(env->cur_state, true);
-- 
cgit 


From b233920c97a6201eb47fbafd78365c6946e6d7b6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 Dec 2018 11:42:31 -0800
Subject: bpf: speed up stacksafe check

Don't check the same stack liveness condition 8 times.
once is enough.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Edward Cree <ecree@solarflare.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ba8e3134bbc2..0c6deb3f2be4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5204,9 +5204,11 @@ static bool stacksafe(struct bpf_func_state *old,
 	for (i = 0; i < old->allocated_stack; i++) {
 		spi = i / BPF_REG_SIZE;
 
-		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+			i += BPF_REG_SIZE - 1;
 			/* explored state didn't use this */
 			continue;
+		}
 
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
 			continue;
-- 
cgit 


From 19e2dbb7dd978d24505e918ac54d6f7dfdc88b1d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 Dec 2018 11:42:33 -0800
Subject: bpf: improve stacksafe state comparison

"if (old->allocated_stack > cur->allocated_stack)" check is too conservative.
In some cases explored stack could have allocated more space,
but that stack space was not live.
The test case improves from 19 to 15 processed insns
and improvement on real programs is significant as well:

                       before    after
bpf_lb-DLB_L3.o        1940      1831
bpf_lb-DLB_L4.o        3089      3029
bpf_lb-DUNKNOWN.o      1065      1064
bpf_lxc-DDROP_ALL.o    28052     26309
bpf_lxc-DUNKNOWN.o     35487     33517
bpf_netdev.o           10864     9713
bpf_overlay.o          6643      6184
bpf_lcx_jit.o          38437     37335

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Edward Cree <ecree@solarflare.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c                       | 13 +++++++------
 tools/testing/selftests/bpf/test_verifier.c | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0c6deb3f2be4..e4724fe8120f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5191,12 +5191,6 @@ static bool stacksafe(struct bpf_func_state *old,
 {
 	int i, spi;
 
-	/* if explored stack has more populated slots than current stack
-	 * such stacks are not equivalent
-	 */
-	if (old->allocated_stack > cur->allocated_stack)
-		return false;
-
 	/* walk slots of the explored stack and ignore any additional
 	 * slots in the current stack, since explored(safe) state
 	 * didn't use them
@@ -5212,6 +5206,13 @@ static bool stacksafe(struct bpf_func_state *old,
 
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
 			continue;
+
+		/* explored stack has more populated slots than current stack
+		 * and these slots were used
+		 */
+		if (i >= cur->allocated_stack)
+			return false;
+
 		/* if old state was safe with misc data in the stack
 		 * it will be safe with zero-initialized stack.
 		 * The opposite is not true
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 82359cdbc805..f9de7fe0c26d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -13647,6 +13647,28 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
 	},
+	{
+		"allocated_stack",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, -8),
+			BPF_STX_MEM(BPF_B, BPF_REG_10, BPF_REG_7, -9),
+			BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_10, -9),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.result_unpriv = ACCEPT,
+		.insn_processed = 15,
+	},
 	{
 		"reference tracking in call: free reference in subprog and outside",
 		.insns = {
-- 
cgit 


From 9242b5f5615c823bfc1e9aea284617ff25a55f10 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 Dec 2018 11:42:34 -0800
Subject: bpf: add self-check logic to liveness analysis

Introduce REG_LIVE_DONE to check the liveness propagation
and prepare the states for merging.
See algorithm description in clean_live_states().

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/verifier.c        | 108 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 108 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 548dcbdb7111..c233efc106c6 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -38,6 +38,7 @@ enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
 	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
 	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e4724fe8120f..0125731e2512 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -397,12 +397,14 @@ static char slot_type_char[] = {
 static void print_liveness(struct bpf_verifier_env *env,
 			   enum bpf_reg_liveness live)
 {
-	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
+	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
 	    verbose(env, "_");
 	if (live & REG_LIVE_READ)
 		verbose(env, "r");
 	if (live & REG_LIVE_WRITTEN)
 		verbose(env, "w");
+	if (live & REG_LIVE_DONE)
+		verbose(env, "D");
 }
 
 static struct bpf_func_state *func(struct bpf_verifier_env *env,
@@ -1132,6 +1134,12 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 		/* if read wasn't screened by an earlier write ... */
 		if (writes && state->live & REG_LIVE_WRITTEN)
 			break;
+		if (parent->live & REG_LIVE_DONE) {
+			verbose(env, "verifier BUG type %s var_off %lld off %d\n",
+				reg_type_str[parent->type],
+				parent->var_off.value, parent->off);
+			return -EFAULT;
+		}
 		/* ... then we depend on parent's value */
 		parent->live |= REG_LIVE_READ;
 		state = parent;
@@ -5078,6 +5086,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
 	return false;
 }
 
+static void clean_func_state(struct bpf_verifier_env *env,
+			     struct bpf_func_state *st)
+{
+	enum bpf_reg_liveness live;
+	int i, j;
+
+	for (i = 0; i < BPF_REG_FP; i++) {
+		live = st->regs[i].live;
+		/* liveness must not touch this register anymore */
+		st->regs[i].live |= REG_LIVE_DONE;
+		if (!(live & REG_LIVE_READ))
+			/* since the register is unused, clear its state
+			 * to make further comparison simpler
+			 */
+			__mark_reg_not_init(&st->regs[i]);
+	}
+
+	for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+		live = st->stack[i].spilled_ptr.live;
+		/* liveness must not touch this stack slot anymore */
+		st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
+		if (!(live & REG_LIVE_READ)) {
+			__mark_reg_not_init(&st->stack[i].spilled_ptr);
+			for (j = 0; j < BPF_REG_SIZE; j++)
+				st->stack[i].slot_type[j] = STACK_INVALID;
+		}
+	}
+}
+
+static void clean_verifier_state(struct bpf_verifier_env *env,
+				 struct bpf_verifier_state *st)
+{
+	int i;
+
+	if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
+		/* all regs in this state in all frames were already marked */
+		return;
+
+	for (i = 0; i <= st->curframe; i++)
+		clean_func_state(env, st->frame[i]);
+}
+
+/* the parentage chains form a tree.
+ * the verifier states are added to state lists at given insn and
+ * pushed into state stack for future exploration.
+ * when the verifier reaches bpf_exit insn some of the verifer states
+ * stored in the state lists have their final liveness state already,
+ * but a lot of states will get revised from liveness point of view when
+ * the verifier explores other branches.
+ * Example:
+ * 1: r0 = 1
+ * 2: if r1 == 100 goto pc+1
+ * 3: r0 = 2
+ * 4: exit
+ * when the verifier reaches exit insn the register r0 in the state list of
+ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. At the insn 4 it will walk the
+ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ *
+ * Since the verifier pushes the branch states as it sees them while exploring
+ * the program the condition of walking the branch instruction for the second
+ * time means that all states below this branch were already explored and
+ * their final liveness markes are already propagated.
+ * Hence when the verifier completes the search of state list in is_state_visited()
+ * we can call this clean_live_states() function to mark all liveness states
+ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
+ * will not be used.
+ * This function also clears the registers and stack for states that !READ
+ * to simplify state merging.
+ *
+ * Important note here that walking the same branch instruction in the callee
+ * doesn't meant that the states are DONE. The verifier has to compare
+ * the callsites
+ */
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
+			      struct bpf_verifier_state *cur)
+{
+	struct bpf_verifier_state_list *sl;
+	int i;
+
+	sl = env->explored_states[insn];
+	if (!sl)
+		return;
+
+	while (sl != STATE_LIST_MARK) {
+		if (sl->state.curframe != cur->curframe)
+			goto next;
+		for (i = 0; i <= cur->curframe; i++)
+			if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
+				goto next;
+		clean_verifier_state(env, &sl->state);
+next:
+		sl = sl->next;
+	}
+}
+
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		    struct idpair *idmap)
@@ -5396,6 +5500,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		 */
 		return 0;
 
+	clean_live_states(env, insn_idx, cur);
+
 	while (sl != STATE_LIST_MARK) {
 		if (states_equal(env, &sl->state, cur)) {
 			/* reached equivalent register/stack state,
-- 
cgit 


From 723679339d087d79e36c0af67f4be84d866fee20 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 11 Dec 2018 20:00:51 +0900
Subject: kconfig: warn no new line at end of file

It would be nice to warn if a new line is missing at end of file.

We could do this by checkpatch.pl for arbitrary files, but new line
is rather essential as a statement terminator in Kconfig.

The warning message looks like this:

  kernel/Kconfig.preempt:60:warning: no new line at end of file

Currently, kernel/Kconfig.preempt is the only file with no new line
at end of file. Fix it.

I know there are some false negative cases. For example, no warning
is displayed when the last line contains some whitespaces/comments,
but no new line. Yet, this commit works well for most cases.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 kernel/Kconfig.preempt  | 2 +-
 scripts/kconfig/zconf.l | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index cd1655122ec0..0fee5fe6c899 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -57,4 +57,4 @@ config PREEMPT
 endchoice
 
 config PREEMPT_COUNT
-       bool
\ No newline at end of file
+       bool
diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l
index 9038e9736bf0..8e856f9e6da9 100644
--- a/scripts/kconfig/zconf.l
+++ b/scripts/kconfig/zconf.l
@@ -261,6 +261,10 @@ n	[A-Za-z0-9_-]
 <<EOF>>	{
 	BEGIN(INITIAL);
 
+	if (prev_token != T_EOL && prev_token != T_HELPTEXT)
+		fprintf(stderr, "%s:%d:warning: no new line at end of file\n",
+			current_file->name, yylineno);
+
 	if (current_file) {
 		zconf_endfile();
 		return T_EOL;
-- 
cgit 


From 0e334db6bb4b1fd1e2d72c1f3d8f004313cd9f94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 17 Dec 2018 13:31:05 +0100
Subject: posix-timers: Fix division by zero bug

The signal delivery path of posix-timers can try to rearm the timer even if
the interval is zero. That's handled for the common case (hrtimer) but not
for alarm timers. In that case the forwarding function raises a division by
zero exception.

The handling for hrtimer based posix timers is wrong because it marks the
timer as active despite the fact that it is stopped.

Move the check from common_hrtimer_rearm() to posixtimer_rearm() to cure
both issues.

Reported-by: syzbot+9d38bedac9cc77b8ad5e@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: sboyd@kernel.org
Cc: stable@vger.kernel.org
Cc: syzkaller-bugs@googlegroups.com
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1812171328050.1880@nanos.tec.linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/posix-timers.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bd62b5eeb5a0..31f49ae80f43 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -289,9 +289,6 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 
-	if (!timr->it_interval)
-		return;
-
 	timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
 					    timr->it_interval);
 	hrtimer_restart(timer);
@@ -317,7 +314,7 @@ void posixtimer_rearm(struct kernel_siginfo *info)
 	if (!timr)
 		return;
 
-	if (timr->it_requeue_pending == info->si_sys_private) {
+	if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
 		timr->kclock->timer_rearm(timr);
 
 		timr->it_active = 1;
-- 
cgit 


From fb1a59fae8baa3f3c69b72a87ff94fc4fa5683ec Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 17 Dec 2018 17:20:55 +0900
Subject: kprobes: Blacklist symbols in arch-defined prohibited area

Blacklist symbols in arch-defined probe-prohibited areas.
With this change, user can see all symbols which are prohibited
to probe in debugfs.

All archtectures which have custom prohibit areas should define
its own arch_populate_kprobe_blacklist() function, but unless that,
all symbols marked __kprobes are blacklisted.

Reported-by: Andrea Righi <righi.andrea@gmail.com>
Tested-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yonghong Song <yhs@fb.com>
Link: http://lkml.kernel.org/r/154503485491.26176.15823229545155174796.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h |  3 +++
 kernel/kprobes.c        | 67 ++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e909413e4e38..5da8a1de2187 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -242,10 +242,13 @@ extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
+extern int arch_populate_kprobe_blacklist(void);
 extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern int kprobe_add_ksym_blacklist(unsigned long entry);
+extern int kprobe_add_area_blacklist(unsigned long start, unsigned long end);
 
 struct kprobe_insn_cache {
 	struct mutex mutex;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90e98e233647..90569aec0f24 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2093,6 +2093,47 @@ void dump_kprobe(struct kprobe *kp)
 }
 NOKPROBE_SYMBOL(dump_kprobe);
 
+int kprobe_add_ksym_blacklist(unsigned long entry)
+{
+	struct kprobe_blacklist_entry *ent;
+	unsigned long offset = 0, size = 0;
+
+	if (!kernel_text_address(entry) ||
+	    !kallsyms_lookup_size_offset(entry, &size, &offset))
+		return -EINVAL;
+
+	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return -ENOMEM;
+	ent->start_addr = entry;
+	ent->end_addr = entry + size;
+	INIT_LIST_HEAD(&ent->list);
+	list_add_tail(&ent->list, &kprobe_blacklist);
+
+	return (int)size;
+}
+
+/* Add all symbols in given area into kprobe blacklist */
+int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
+{
+	unsigned long entry;
+	int ret = 0;
+
+	for (entry = start; entry < end; entry += ret) {
+		ret = kprobe_add_ksym_blacklist(entry);
+		if (ret < 0)
+			return ret;
+		if (ret == 0)	/* In case of alias symbol */
+			ret = 1;
+	}
+	return 0;
+}
+
+int __init __weak arch_populate_kprobe_blacklist(void)
+{
+	return 0;
+}
+
 /*
  * Lookup and populate the kprobe_blacklist.
  *
@@ -2104,26 +2145,24 @@ NOKPROBE_SYMBOL(dump_kprobe);
 static int __init populate_kprobe_blacklist(unsigned long *start,
 					     unsigned long *end)
 {
+	unsigned long entry;
 	unsigned long *iter;
-	struct kprobe_blacklist_entry *ent;
-	unsigned long entry, offset = 0, size = 0;
+	int ret;
 
 	for (iter = start; iter < end; iter++) {
 		entry = arch_deref_entry_point((void *)*iter);
-
-		if (!kernel_text_address(entry) ||
-		    !kallsyms_lookup_size_offset(entry, &size, &offset))
+		ret = kprobe_add_ksym_blacklist(entry);
+		if (ret == -EINVAL)
 			continue;
-
-		ent = kmalloc(sizeof(*ent), GFP_KERNEL);
-		if (!ent)
-			return -ENOMEM;
-		ent->start_addr = entry;
-		ent->end_addr = entry + size;
-		INIT_LIST_HEAD(&ent->list);
-		list_add_tail(&ent->list, &kprobe_blacklist);
+		if (ret < 0)
+			return ret;
 	}
-	return 0;
+
+	/* Symbols in __kprobes_text are blacklisted */
+	ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
+					(unsigned long)__kprobes_text_end);
+
+	return ret ? : arch_populate_kprobe_blacklist();
 }
 
 /* Module notifier call back, checking kprobes on the module */
-- 
cgit 


From c92a54cfa0257e8ffd66b2a17d49e9c0bd4b769f Mon Sep 17 00:00:00 2001
From: "Lendacky, Thomas" <Thomas.Lendacky@amd.com>
Date: Mon, 17 Dec 2018 14:39:16 +0000
Subject: dma-direct: do not include SME mask in the DMA supported check

The dma_direct_supported() function intends to check the DMA mask against
specific values. However, the phys_to_dma() function includes the SME
encryption mask, which defeats the intended purpose of the check. This
results in drivers that support less than 48-bit DMA (SME encryption mask
is bit 47) from being able to set the DMA mask successfully when SME is
active, which results in the driver failing to initialize.

Change the function used to check the mask from phys_to_dma() to
__phys_to_dma() so that the SME encryption mask is not part of the check.

Fixes: c1d0af1a1d5d ("kernel/dma/direct: take DMA offset into account in dma_direct_supported")
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 22a12ab5a5e9..375c77e8d52f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -309,7 +309,12 @@ int dma_direct_supported(struct device *dev, u64 mask)
 
 	min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
 
-	return mask >= phys_to_dma(dev, min_mask);
+	/*
+	 * This check needs to be against the actual bit mask value, so
+	 * use __phys_to_dma() here so that the SME encryption mask isn't
+	 * part of the check.
+	 */
+	return mask >= __phys_to_dma(dev, min_mask);
 }
 
 int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
-- 
cgit 


From 6c4fc209fcf9d27efbaa48368773e4d2bfbd59aa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 16 Dec 2018 00:49:47 +0100
Subject: bpf: remove useless version check for prog load

Existing libraries and tracing frameworks work around this kernel
version check by automatically deriving the kernel version from
uname(3) or similar such that the user does not need to do it
manually; these workarounds also make the version check useless
at the same time.

Moreover, most other BPF tracing types enabling bpf_probe_read()-like
functionality have /not/ adapted this check, and in general these
days it is well understood anyway that all the tracing programs are
not stable with regards to future kernels as kernel internal data
structures are subject to change from release to release.

Back at last netconf we discussed [0] and agreed to remove this
check from bpf_prog_load() and instead document it here in the uapi
header that there is no such guarantee for stable API for these
programs.

  [0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 10 +++++++++-
 kernel/bpf/syscall.c           |  5 -----
 tools/include/uapi/linux/bpf.h | 10 +++++++++-
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e7d57e89f25f..1d324c2cbca2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -133,6 +133,14 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -343,7 +351,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6ae062f1cf20..5db31067d85e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1473,11 +1473,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
 		return -E2BIG;
-
-	if (type == BPF_PROG_TYPE_KPROBE &&
-	    attr->kern_version != LINUX_VERSION_CODE)
-		return -EINVAL;
-
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
 	    !capable(CAP_SYS_ADMIN))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e7d57e89f25f..1d324c2cbca2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -133,6 +133,14 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -343,7 +351,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
-- 
cgit 


From f97be3ab044cf6dd342fed7668c977ba07a7cd95 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 15 Dec 2018 22:13:50 -0800
Subject: bpf: btf: refactor btf_int_bits_seq_show()

Refactor function btf_int_bits_seq_show() by creating
function btf_bitfield_seq_show() which has no dependence
on btf and btf_type. The function btf_bitfield_seq_show()
will be in later patch to directly dump bitfield member values.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8fa0bf1c33fd..72caa799e82f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1068,26 +1068,16 @@ static void btf_int_log(struct btf_verifier_env *env,
 			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
 }
 
-static void btf_int_bits_seq_show(const struct btf *btf,
-				  const struct btf_type *t,
-				  void *data, u8 bits_offset,
-				  struct seq_file *m)
+static void btf_bitfield_seq_show(void *data, u8 bits_offset,
+				  u8 nr_bits, struct seq_file *m)
 {
 	u16 left_shift_bits, right_shift_bits;
-	u32 int_data = btf_type_int(t);
-	u8 nr_bits = BTF_INT_BITS(int_data);
-	u8 total_bits_offset;
 	u8 nr_copy_bytes;
 	u8 nr_copy_bits;
 	u64 print_num;
 
-	/*
-	 * bits_offset is at most 7.
-	 * BTF_INT_OFFSET() cannot exceed 64 bits.
-	 */
-	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
-	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
-	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+	data += BITS_ROUNDDOWN_BYTES(bits_offset);
+	bits_offset = BITS_PER_BYTE_MASKED(bits_offset);
 	nr_copy_bits = nr_bits + bits_offset;
 	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
 
@@ -1107,6 +1097,23 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 	seq_printf(m, "0x%llx", print_num);
 }
 
+static void btf_int_bits_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  void *data, u8 bits_offset,
+				  struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u8 nr_bits = BTF_INT_BITS(int_data);
+	u8 total_bits_offset;
+
+	/*
+	 * bits_offset is at most 7.
+	 * BTF_INT_OFFSET() cannot exceed 64 bits.
+	 */
+	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+	btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m);
+}
+
 static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
 			     u32 type_id, void *data, u8 bits_offset,
 			     struct seq_file *m)
-- 
cgit 


From 9d5f9f701b1891466fb3dbb1806ad97716f95cc3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 15 Dec 2018 22:13:51 -0800
Subject: bpf: btf: fix struct/union/fwd types with kind_flag

This patch fixed two issues with BTF. One is related to
struct/union bitfield encoding and the other is related to
forward type.

Issue #1 and solution:

======================

Current btf encoding of bitfield follows what pahole generates.
For each bitfield, pahole will duplicate the type chain and
put the bitfield size at the final int or enum type.
Since the BTF enum type cannot encode bit size,
pahole workarounds the issue by generating
an int type whenever the enum bit size is not 32.

For example,
  -bash-4.4$ cat t.c
  typedef int ___int;
  enum A { A1, A2, A3 };
  struct t {
    int a[5];
    ___int b:4;
    volatile enum A c:4;
  } g;
  -bash-4.4$ gcc -c -O2 -g t.c
The current kernel supports the following BTF encoding:
  $ pahole -JV t.o
  [1] TYPEDEF ___int type_id=2
  [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [3] ENUM A size=4 vlen=3
        A1 val=0
        A2 val=1
        A3 val=2
  [4] STRUCT t size=24 vlen=3
        a type_id=5 bits_offset=0
        b type_id=9 bits_offset=160
        c type_id=11 bits_offset=164
  [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5
  [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none)
  [7] VOLATILE (anon) type_id=3
  [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none)
  [9] TYPEDEF ___int type_id=8
  [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED
  [11] VOLATILE (anon) type_id=10

Two issues are in the above:
  . by changing enum type to int, we lost the original
    type information and this will not be ideal later
    when we try to convert BTF to a header file.
  . the type duplication for bitfields will cause
    BTF bloat. Duplicated types cannot be deduplicated
    later if the bitfield size is different.

To fix this issue, this patch implemented a compatible
change for BTF struct type encoding:
  . the bit 31 of struct_type->info, previously reserved,
    now is used to indicate whether bitfield_size is
    encoded in btf_member or not.
  . if bit 31 of struct_type->info is set,
    btf_member->offset will encode like:
      bit 0 - 23: bit offset
      bit 24 - 31: bitfield size
    if bit 31 is not set, the old behavior is preserved:
      bit 0 - 31: bit offset

So if the struct contains a bit field, the maximum bit offset
will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum
bitfield size will be 256 which is enough for today as maximum
bitfield in compiler can be 128 where int128 type is supported.

This kernel patch intends to support the new BTF encoding:
  $ pahole -JV t.o
  [1] TYPEDEF ___int type_id=2
  [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [3] ENUM A size=4 vlen=3
        A1 val=0
        A2 val=1
        A3 val=2
  [4] STRUCT t kind_flag=1 size=24 vlen=3
        a type_id=5 bitfield_size=0 bits_offset=0
        b type_id=1 bitfield_size=4 bits_offset=160
        c type_id=7 bitfield_size=4 bits_offset=164
  [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5
  [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none)
  [7] VOLATILE (anon) type_id=3

Issue #2 and solution:
======================

Current forward type in BTF does not specify whether the original
type is struct or union. This will not work for type pretty print
and BTF-to-header-file conversion as struct/union must be specified.
  $ cat tt.c
  struct t;
  union u;
  int foo(struct t *t, union u *u) { return 0; }
  $ gcc -c -g -O2 tt.c
  $ pahole -JV tt.o
  [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [2] FWD t type_id=0
  [3] PTR (anon) type_id=2
  [4] FWD u type_id=0
  [5] PTR (anon) type_id=4

To fix this issue, similar to issue #1, type->info bit 31
is used. If the bit is set, it is union type. Otherwise, it is
a struct type.

  $ pahole -JV tt.o
  [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
  [2] FWD t kind_flag=0 type_id=0
  [3] PTR (anon) kind_flag=0 type_id=2
  [4] FWD u kind_flag=1 type_id=0
  [5] PTR (anon) kind_flag=0 type_id=4

Pahole/LLVM change:
===================

The new kind_flag functionality has been implemented in pahole
and llvm:
  https://github.com/yonghong-song/pahole/tree/bitfield
  https://github.com/yonghong-song/llvm/tree/bitfield

Note that pahole hasn't implemented func/func_proto kind
and .BTF.ext. So to print function signature with bpftool,
the llvm compiler should be used.

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h |  20 +++-
 kernel/bpf/btf.c         | 280 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 278 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 14f66948fc95..7b7475ef2f17 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -34,7 +34,9 @@ struct btf_type {
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
 	 * bits 24-27: kind (e.g. int, ptr, array...etc)
-	 * bits 28-31: unused
+	 * bits 28-30: unused
+	 * bit     31: kind_flag, currently used by
+	 *             struct, union and fwd
 	 */
 	__u32 info;
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -52,6 +54,7 @@ struct btf_type {
 
 #define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
+#define BTF_INFO_KFLAG(info)	((info) >> 31)
 
 #define BTF_KIND_UNKN		0	/* Unknown	*/
 #define BTF_KIND_INT		1	/* Integer	*/
@@ -110,9 +113,22 @@ struct btf_array {
 struct btf_member {
 	__u32	name_off;
 	__u32	type;
-	__u32	offset;	/* offset in bits */
+	/* If the type info kind_flag is set, the btf_member offset
+	 * contains both member bitfield size and bit offset. The
+	 * bitfield size is set for bitfield members. If the type
+	 * info kind_flag is not set, the offset contains only bit
+	 * offset.
+	 */
+	__u32	offset;
 };
 
+/* If the struct/union type info kind_flag is set, the
+ * following two macros are used to access bitfield_size
+ * and bit_offset from btf_member.offset.
+ */
+#define BTF_MEMBER_BITFIELD_SIZE(val)	((val) >> 24)
+#define BTF_MEMBER_BIT_OFFSET(val)	((val) & 0xffffff)
+
 /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param".
  * The exact number of btf_param is stored in the vlen (of the
  * info in "struct btf_type").
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 72caa799e82f..93b6905e3a9b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -164,7 +164,7 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
-#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INFO_MASK 0x8f00ffff
 #define BTF_INT_MASK 0x0fffffff
 #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
 #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -274,6 +274,10 @@ struct btf_kind_operations {
 			    const struct btf_type *struct_type,
 			    const struct btf_member *member,
 			    const struct btf_type *member_type);
+	int (*check_kflag_member)(struct btf_verifier_env *env,
+				  const struct btf_type *struct_type,
+				  const struct btf_member *member,
+				  const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 	void (*seq_show)(const struct btf *btf, const struct btf_type *t,
@@ -419,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t)
 	return BTF_INFO_VLEN(t->info);
 }
 
+static bool btf_type_kflag(const struct btf_type *t)
+{
+	return BTF_INFO_KFLAG(t->info);
+}
+
+static u32 btf_member_bit_offset(const struct btf_type *struct_type,
+			     const struct btf_member *member)
+{
+	return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
+					   : member->offset;
+}
+
+static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
+				    const struct btf_member *member)
+{
+	return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
+					   : 0;
+}
+
 static u32 btf_type_int(const struct btf_type *t)
 {
 	return *(u32 *)(t + 1);
@@ -627,9 +650,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	if (env->phase != CHECK_META)
 		btf_verifier_log_type(env, struct_type, NULL);
 
-	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-			   __btf_name_by_offset(btf, member->name_off),
-			   member->type, member->offset);
+	if (btf_type_kflag(struct_type))
+		__btf_verifier_log(log,
+				   "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
+				   __btf_name_by_offset(btf, member->name_off),
+				   member->type,
+				   BTF_MEMBER_BITFIELD_SIZE(member->offset),
+				   BTF_MEMBER_BIT_OFFSET(member->offset));
+	else
+		__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+				   __btf_name_by_offset(btf, member->name_off),
+				   member->type, member->offset);
 
 	if (fmt && *fmt) {
 		__btf_verifier_log(log, " ");
@@ -945,6 +976,38 @@ static int btf_df_check_member(struct btf_verifier_env *env,
 	return -EINVAL;
 }
 
+static int btf_df_check_kflag_member(struct btf_verifier_env *env,
+				     const struct btf_type *struct_type,
+				     const struct btf_member *member,
+				     const struct btf_type *member_type)
+{
+	btf_verifier_log_basic(env, struct_type,
+			       "Unsupported check_kflag_member");
+	return -EINVAL;
+}
+
+/* Used for ptr, array and struct/union type members.
+ * int, enum and modifier types have their specific callback functions.
+ */
+static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
+					  const struct btf_type *struct_type,
+					  const struct btf_member *member,
+					  const struct btf_type *member_type)
+{
+	if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	/* bitfield size is 0, so member->offset represents bit offset only.
+	 * It is safe to call non kflag check_member variants.
+	 */
+	return btf_type_ops(member_type)->check_member(env, struct_type,
+						       member,
+						       member_type);
+}
+
 static int btf_df_resolve(struct btf_verifier_env *env,
 			  const struct resolve_vertex *v)
 {
@@ -997,6 +1060,62 @@ static int btf_int_check_member(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_int_check_kflag_member(struct btf_verifier_env *env,
+				      const struct btf_type *struct_type,
+				      const struct btf_member *member,
+				      const struct btf_type *member_type)
+{
+	u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
+	u32 int_data = btf_type_int(member_type);
+	u32 struct_size = struct_type->size;
+	u32 nr_copy_bits;
+
+	/* a regular int type is required for the kflag int member */
+	if (!btf_type_int_is_regular(member_type)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member base type");
+		return -EINVAL;
+	}
+
+	/* check sanity of bitfield size */
+	nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+	struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+	nr_int_data_bits = BTF_INT_BITS(int_data);
+	if (!nr_bits) {
+		/* Not a bitfield member, member offset must be at byte
+		 * boundary.
+		 */
+		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+			btf_verifier_log_member(env, struct_type, member,
+						"Invalid member offset");
+			return -EINVAL;
+		}
+
+		nr_bits = nr_int_data_bits;
+	} else if (nr_bits > nr_int_data_bits) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
+	if (nr_copy_bits > BITS_PER_U64) {
+		btf_verifier_log_member(env, struct_type, member,
+					"nr_copy_bits exceeds 64");
+		return -EINVAL;
+	}
+
+	if (struct_size < bytes_offset ||
+	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_int_check_meta(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 meta_left)
@@ -1016,6 +1135,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	int_data = btf_type_int(t);
 	if (int_data & ~BTF_INT_MASK) {
 		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
@@ -1097,6 +1221,7 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset,
 	seq_printf(m, "0x%llx", print_num);
 }
 
+
 static void btf_int_bits_seq_show(const struct btf *btf,
 				  const struct btf_type *t,
 				  void *data, u8 bits_offset,
@@ -1163,6 +1288,7 @@ static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_int_check_member,
+	.check_kflag_member = btf_int_check_kflag_member,
 	.log_details = btf_int_log,
 	.seq_show = btf_int_seq_show,
 };
@@ -1192,6 +1318,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env,
 							 resolved_type);
 }
 
+static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
+					   const struct btf_type *struct_type,
+					   const struct btf_member *member,
+					   const struct btf_type *member_type)
+{
+	const struct btf_type *resolved_type;
+	u32 resolved_type_id = member->type;
+	struct btf_member resolved_member;
+	struct btf *btf = env->btf;
+
+	resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+	if (!resolved_type) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member");
+		return -EINVAL;
+	}
+
+	resolved_member = *member;
+	resolved_member.type = resolved_type_id;
+
+	return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
+							       &resolved_member,
+							       resolved_type);
+}
+
 static int btf_ptr_check_member(struct btf_verifier_env *env,
 				const struct btf_type *struct_type,
 				const struct btf_member *member,
@@ -1227,6 +1378,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (!BTF_TYPE_ID_VALID(t->type)) {
 		btf_verifier_log_type(env, t, "Invalid type_id");
 		return -EINVAL;
@@ -1380,6 +1536,7 @@ static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_modifier_resolve,
 	.check_member = btf_modifier_check_member,
+	.check_kflag_member = btf_modifier_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_modifier_seq_show,
 };
@@ -1388,6 +1545,7 @@ static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_ptr_resolve,
 	.check_member = btf_ptr_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_ptr_seq_show,
 };
@@ -1422,6 +1580,7 @@ static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_df_seq_show,
 };
@@ -1480,6 +1639,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (t->size) {
 		btf_verifier_log_type(env, t, "size != 0");
 		return -EINVAL;
@@ -1603,6 +1767,7 @@ static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
 	.check_member = btf_array_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_array_log,
 	.seq_show = btf_array_seq_show,
 };
@@ -1641,6 +1806,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	u32 meta_needed, last_offset;
 	struct btf *btf = env->btf;
 	u32 struct_size = t->size;
+	u32 offset;
 	u16 i;
 
 	meta_needed = btf_type_vlen(t) * sizeof(*member);
@@ -1682,7 +1848,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 			return -EINVAL;
 		}
 
-		if (is_union && member->offset) {
+		offset = btf_member_bit_offset(t, member);
+		if (is_union && offset) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member bits_offset");
 			return -EINVAL;
@@ -1692,20 +1859,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 		 * ">" instead of ">=" because the last member could be
 		 * "char a[0];"
 		 */
-		if (last_offset > member->offset) {
+		if (last_offset > offset) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member bits_offset");
 			return -EINVAL;
 		}
 
-		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+		if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
 			btf_verifier_log_member(env, t, member,
 						"Member bits_offset exceeds its struct size");
 			return -EINVAL;
 		}
 
 		btf_verifier_log_member(env, t, member, NULL);
-		last_offset = member->offset;
+		last_offset = offset;
 	}
 
 	return meta_needed;
@@ -1735,9 +1902,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 
 		last_member_type = btf_type_by_id(env->btf,
 						  last_member_type_id);
-		err = btf_type_ops(last_member_type)->check_member(env, v->t,
-							last_member,
-							last_member_type);
+		if (btf_type_kflag(v->t))
+			err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
+								last_member,
+								last_member_type);
+		else
+			err = btf_type_ops(last_member_type)->check_member(env, v->t,
+								last_member,
+								last_member_type);
 		if (err)
 			return err;
 	}
@@ -1759,9 +1931,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			return env_stack_push(env, member_type, member_type_id);
 		}
 
-		err = btf_type_ops(member_type)->check_member(env, v->t,
-							      member,
-							      member_type);
+		if (btf_type_kflag(v->t))
+			err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
+									    member,
+									    member_type);
+		else
+			err = btf_type_ops(member_type)->check_member(env, v->t,
+								      member,
+								      member_type);
 		if (err)
 			return err;
 	}
@@ -1789,17 +1966,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								member->type);
-		u32 member_offset = member->offset;
-		u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
-		u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
 		const struct btf_kind_operations *ops;
+		u32 member_offset, bitfield_size;
+		u32 bytes_offset;
+		u8 bits8_offset;
 
 		if (i)
 			seq_puts(m, seq);
 
-		ops = btf_type_ops(member_type);
-		ops->seq_show(btf, member_type, member->type,
-			      data + bytes_offset, bits8_offset, m);
+		member_offset = btf_member_bit_offset(t, member);
+		bitfield_size = btf_member_bitfield_size(t, member);
+		if (bitfield_size) {
+			btf_bitfield_seq_show(data, member_offset,
+					      bitfield_size, m);
+		} else {
+			bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+			bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+			ops = btf_type_ops(member_type);
+			ops->seq_show(btf, member_type, member->type,
+				      data + bytes_offset, bits8_offset, m);
+		}
 	}
 	seq_puts(m, "}");
 }
@@ -1808,6 +1994,7 @@ static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
 	.check_member = btf_struct_check_member,
+	.check_kflag_member = btf_generic_check_kflag_member,
 	.log_details = btf_struct_log,
 	.seq_show = btf_struct_seq_show,
 };
@@ -1837,6 +2024,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
+				       const struct btf_type *struct_type,
+				       const struct btf_member *member,
+				       const struct btf_type *member_type)
+{
+	u32 struct_bits_off, nr_bits, bytes_end, struct_size;
+	u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
+
+	struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+	nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+	if (!nr_bits) {
+		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+			btf_verifier_log_member(env, struct_type, member,
+						"Member is not byte aligned");
+				return -EINVAL;
+		}
+
+		nr_bits = int_bitsize;
+	} else if (nr_bits > int_bitsize) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member bitfield_size");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
+	if (struct_size < bytes_end) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 			       const struct btf_type *t,
 			       u32 meta_left)
@@ -1856,6 +2078,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	if (t->size != sizeof(int)) {
 		btf_verifier_log_type(env, t, "Expected size:%zu",
 				      sizeof(int));
@@ -1924,6 +2151,7 @@ static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_enum_check_member,
+	.check_kflag_member = btf_enum_check_kflag_member,
 	.log_details = btf_enum_log,
 	.seq_show = btf_enum_seq_show,
 };
@@ -1946,6 +2174,11 @@ static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	btf_verifier_log_type(env, t, NULL);
 
 	return meta_needed;
@@ -2005,6 +2238,7 @@ static struct btf_kind_operations func_proto_ops = {
 	 * Hence, there is no btf_func_check_member().
 	 */
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_func_proto_log,
 	.seq_show = btf_df_seq_show,
 };
@@ -2024,6 +2258,11 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (btf_type_kflag(t)) {
+		btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+		return -EINVAL;
+	}
+
 	btf_verifier_log_type(env, t, NULL);
 
 	return 0;
@@ -2033,6 +2272,7 @@ static struct btf_kind_operations func_ops = {
 	.check_meta = btf_func_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
+	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_ref_type_log,
 	.seq_show = btf_df_seq_show,
 };
-- 
cgit 


From ffa0c1cf59596fba54546ea828305acfcc2cf55e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 15 Dec 2018 22:13:52 -0800
Subject: bpf: enable cgroup local storage map pretty print with kind_flag

Commit 970289fc0a83 ("bpf: add bpffs pretty print for cgroup
local storage maps") added bpffs pretty print for cgroup
local storage maps. The commit worked for struct without kind_flag
set.

This patch refactored and made pretty print also work
with kind_flag set for the struct.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h        |  5 ++++-
 kernel/bpf/btf.c           | 37 ++++++++++++++++++++++++++++---------
 kernel/bpf/local_storage.c | 17 ++++-------------
 3 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 58000d7e06e3..12502e25e767 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 
 struct btf;
+struct btf_member;
 struct btf_type;
 union bpf_attr;
 
@@ -46,7 +47,9 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
 int btf_get_fd_by_id(u32 id);
 u32 btf_id(const struct btf *btf);
-bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size);
+bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
+			   const struct btf_member *m,
+			   u32 expected_offset, u32 expected_size);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 93b6905e3a9b..e804b26a0506 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -546,22 +546,41 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
 }
 
 /*
- * Check that given type is a regular int and has the expected size.
+ * Check that given struct member is a regular int with expected
+ * offset and size.
  */
-bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size)
+bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
+			   const struct btf_member *m,
+			   u32 expected_offset, u32 expected_size)
 {
-	u8 nr_bits, nr_bytes;
-	u32 int_data;
+	const struct btf_type *t;
+	u32 id, int_data;
+	u8 nr_bits;
 
-	if (!btf_type_is_int(t))
+	id = m->type;
+	t = btf_type_id_size(btf, &id, NULL);
+	if (!t || !btf_type_is_int(t))
 		return false;
 
 	int_data = btf_type_int(t);
 	nr_bits = BTF_INT_BITS(int_data);
-	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-	if (BITS_PER_BYTE_MASKED(nr_bits) ||
-	    BTF_INT_OFFSET(int_data) ||
-	    nr_bytes != expected_size)
+	if (btf_type_kflag(s)) {
+		u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset);
+		u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+
+		/* if kflag set, int should be a regular int and
+		 * bit offset should be at byte boundary.
+		 */
+		return !bitfield_size &&
+		       BITS_ROUNDUP_BYTES(bit_offset) == expected_offset &&
+		       BITS_ROUNDUP_BYTES(nr_bits) == expected_size;
+	}
+
+	if (BTF_INT_OFFSET(int_data) ||
+	    BITS_PER_BYTE_MASKED(m->offset) ||
+	    BITS_ROUNDUP_BYTES(m->offset) != expected_offset ||
+	    BITS_PER_BYTE_MASKED(nr_bits) ||
+	    BITS_ROUNDUP_BYTES(nr_bits) != expected_size)
 		return false;
 
 	return true;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 5eca03da0989..07a34ef562a0 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -315,9 +315,8 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type)
 {
-	const struct btf_type *t;
 	struct btf_member *m;
-	u32 id, size;
+	u32 offset, size;
 
 	/* Key is expected to be of struct bpf_cgroup_storage_key type,
 	 * which is:
@@ -338,25 +337,17 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
 	 * The first field must be a 64 bit integer at 0 offset.
 	 */
 	m = (struct btf_member *)(key_type + 1);
-	if (m->offset)
-		return -EINVAL;
-	id = m->type;
-	t = btf_type_id_size(btf, &id, NULL);
 	size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
-	if (!t || !btf_type_is_reg_int(t, size))
+	if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
 		return -EINVAL;
 
 	/*
 	 * The second field must be a 32 bit integer at 64 bit offset.
 	 */
 	m++;
-	if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) *
-	    BITS_PER_BYTE)
-		return -EINVAL;
-	id = m->type;
-	t = btf_type_id_size(btf, &id, NULL);
+	offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
 	size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
-	if (!t || !btf_type_is_reg_int(t, size))
+	if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
 		return -EINVAL;
 
 	return 0;
-- 
cgit 


From 07daef8b41e0d9e7802a448f6766504e7641a234 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sun, 9 Dec 2018 14:22:25 +0800
Subject: ntp: Remove duplicated include

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <john.stultz@linaro.org>
Cc: <sboyd@kernel.org>
Link: https://lkml.kernel.org/r/20181209062225.4344-1-yuehaibing@huawei.com
---
 kernel/time/ntp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c5e0cba3b39c..bc3a3c37ec9c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/rtc.h>
-#include <linux/math64.h>
 
 #include "ntp_internal.h"
 #include "timekeeping_internal.h"
-- 
cgit 


From c5f48c0a7aa1a8c82d81cdf27e63aa0a5544c6e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 3 Dec 2018 11:44:51 +0100
Subject: genirq: Fix various typos in comments

Go over the IRQ subsystem source code (including irqchip drivers) and
fix common typos in comments.

No change in functionality intended.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
---
 drivers/irqchip/irq-dw-apb-ictl.c | 2 +-
 drivers/irqchip/irq-gic.c         | 6 +++---
 drivers/irqchip/irq-renesas-h8s.c | 2 +-
 drivers/irqchip/irq-s3c24xx.c     | 2 +-
 include/linux/irqchip.h           | 4 ++--
 kernel/irq/chip.c                 | 2 +-
 kernel/irq/ipi.c                  | 4 ++--
 kernel/irq/manage.c               | 2 +-
 kernel/irq/spurious.c             | 6 +++---
 9 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/drivers/irqchip/irq-dw-apb-ictl.c b/drivers/irqchip/irq-dw-apb-ictl.c
index 0a19618ce2c8..e4550e9c810b 100644
--- a/drivers/irqchip/irq-dw-apb-ictl.c
+++ b/drivers/irqchip/irq-dw-apb-ictl.c
@@ -105,7 +105,7 @@ static int __init dw_apb_ictl_init(struct device_node *np,
 	 * DW IP can be configured to allow 2-64 irqs. We can determine
 	 * the number of irqs supported by writing into enable register
 	 * and look for bits not set, as corresponding flip-flops will
-	 * have been removed by sythesis tool.
+	 * have been removed by synthesis tool.
 	 */
 
 	/* mask and enable all interrupts */
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index ced10c44b68a..ba2a37a27a54 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -604,8 +604,8 @@ void gic_dist_save(struct gic_chip_data *gic)
 /*
  * Restores the GIC distributor registers during resume or when coming out of
  * idle.  Must be called before enabling interrupts.  If a level interrupt
- * that occured while the GIC was suspended is still present, it will be
- * handled normally, but any edge interrupts that occured will not be seen by
+ * that occurred while the GIC was suspended is still present, it will be
+ * handled normally, but any edge interrupts that occurred will not be seen by
  * the GIC and need to be handled by the platform-specific wakeup source.
  */
 void gic_dist_restore(struct gic_chip_data *gic)
@@ -899,7 +899,7 @@ void gic_migrate_target(unsigned int new_cpu_id)
 	gic_cpu_map[cpu] = 1 << new_cpu_id;
 
 	/*
-	 * Find all the peripheral interrupts targetting the current
+	 * Find all the peripheral interrupts targeting the current
 	 * CPU interface and migrate them to the new CPU interface.
 	 * We skip DIST_TARGET 0 to 7 as they are read-only.
 	 */
diff --git a/drivers/irqchip/irq-renesas-h8s.c b/drivers/irqchip/irq-renesas-h8s.c
index 85234d456638..4e2461bae944 100644
--- a/drivers/irqchip/irq-renesas-h8s.c
+++ b/drivers/irqchip/irq-renesas-h8s.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * H8S interrupt contoller driver
+ * H8S interrupt controller driver
  *
  * Copyright 2015 Yoshinori Sato <ysato@users.sourceforge.jp>
  */
diff --git a/drivers/irqchip/irq-s3c24xx.c b/drivers/irqchip/irq-s3c24xx.c
index c19766fe8a1a..b623f300f1b1 100644
--- a/drivers/irqchip/irq-s3c24xx.c
+++ b/drivers/irqchip/irq-s3c24xx.c
@@ -58,7 +58,7 @@ struct s3c_irq_data {
 };
 
 /*
- * Sructure holding the controller data
+ * Structure holding the controller data
  * @reg_pending		register holding pending irqs
  * @reg_intpnd		special register intpnd in main intc
  * @reg_mask		mask register
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 89c34b200671..950e4b2458f0 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -19,7 +19,7 @@
  * the association between their DT compatible string and their
  * initialization function.
  *
- * @name: name that must be unique accross all IRQCHIP_DECLARE of the
+ * @name: name that must be unique across all IRQCHIP_DECLARE of the
  * same file.
  * @compstr: compatible string of the irqchip driver
  * @fn: initialization function
@@ -30,7 +30,7 @@
  * This macro must be used by the different irqchip drivers to declare
  * the association between their version and their initialization function.
  *
- * @name: name that must be unique accross all IRQCHIP_ACPI_DECLARE of the
+ * @name: name that must be unique across all IRQCHIP_ACPI_DECLARE of the
  * same file.
  * @subtable: Subtable to be identified in MADT
  * @validate: Function to be called on that subtable to check its validity.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b3d9de999c..34e969069488 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -929,7 +929,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 				break;
 			/*
 			 * Bail out if the outer chip is not set up
-			 * and the interrrupt supposed to be started
+			 * and the interrupt supposed to be started
 			 * right away.
 			 */
 			if (WARN_ON(is_chained))
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 8b778e37dc6d..43e3d1be622c 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -56,7 +56,7 @@ int irq_reserve_ipi(struct irq_domain *domain,
 		unsigned int next;
 
 		/*
-		 * The IPI requires a seperate HW irq on each CPU. We require
+		 * The IPI requires a separate HW irq on each CPU. We require
 		 * that the destination mask is consecutive. If an
 		 * implementation needs to support holes, it can reserve
 		 * several IPI ranges.
@@ -172,7 +172,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
 
 	/*
 	 * Get the real hardware irq number if the underlying implementation
-	 * uses a seperate irq per cpu. If the underlying implementation uses
+	 * uses a separate irq per cpu. If the underlying implementation uses
 	 * a single hardware irq for all cpus then the IPI send mechanism
 	 * needs to take care of the cpu destinations.
 	 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9dbdccab3b6a..a4888ce4667a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -915,7 +915,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
 #endif
 
 /*
- * Interrupts which are not explicitely requested as threaded
+ * Interrupts which are not explicitly requested as threaded
  * interrupts rely on the implicit bh/preempt disable of the hard irq
  * context. So we need to disable bh here to avoid deadlocks and other
  * side effects.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index d867d6ddafdd..6d2fa6914b30 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -66,7 +66,7 @@ static int try_one_irq(struct irq_desc *desc, bool force)
 	raw_spin_lock(&desc->lock);
 
 	/*
-	 * PER_CPU, nested thread interrupts and interrupts explicitely
+	 * PER_CPU, nested thread interrupts and interrupts explicitly
 	 * marked polled are excluded from polling.
 	 */
 	if (irq_settings_is_per_cpu(desc) ||
@@ -76,7 +76,7 @@ static int try_one_irq(struct irq_desc *desc, bool force)
 
 	/*
 	 * Do not poll disabled interrupts unless the spurious
-	 * disabled poller asks explicitely.
+	 * disabled poller asks explicitly.
 	 */
 	if (irqd_irq_disabled(&desc->irq_data) && !force)
 		goto out;
@@ -292,7 +292,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 	 * So in case a thread is woken, we just note the fact and
 	 * defer the analysis to the next hardware interrupt.
 	 *
-	 * The threaded handlers store whether they sucessfully
+	 * The threaded handlers store whether they successfully
 	 * handled an interrupt and we check whether that number
 	 * changed versus the last invocation.
 	 *
-- 
cgit 


From e11d4284e2f4de5048c6d1787c82226f0a198292 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Apr 2018 13:43:52 +0200
Subject: y2038: socket: Add compat_sys_recvmmsg_time64

recvmmsg() takes two arguments to pointers of structures that differ
between 32-bit and 64-bit architectures: mmsghdr and timespec.

For y2038 compatbility, we are changing the native system call from
timespec to __kernel_timespec with a 64-bit time_t (in another patch),
and use the existing compat system call on both 32-bit and 64-bit
architectures for compatibility with traditional 32-bit user space.

As we now have two variants of recvmmsg() for 32-bit tasks that are both
different from the variant that we use on 64-bit tasks, this means we
also require two compat system calls!

The solution I picked is to flip things around: The existing
compat_sys_recvmmsg() call gets moved from net/compat.c into net/socket.c
and now handles the case for old user space on all architectures that
have set CONFIG_COMPAT_32BIT_TIME.  A new compat_sys_recvmmsg_time64()
call gets added in the old place for 64-bit architectures only, this
one handles the case of a compat mmsghdr structure combined with
__kernel_timespec.

In the indirect sys_socketcall(), we now need to call either
do_sys_recvmmsg() or __compat_sys_recvmmsg(), depending on what kind of
architecture we are on. For compat_sys_socketcall(), no such change is
needed, we always call __compat_sys_recvmmsg().

I decided to not add a new SYS_RECVMMSG_TIME64 socketcall: Any libc
implementation for 64-bit time_t will need significant changes including
an updated asm/unistd.h, and it seems better to consistently use the
separate syscalls that configuration, leaving the socketcall only for
backward compatibility with 32-bit time_t based libc.

The naming is asymmetric for the moment, so both existing syscalls
entry points keep their names, while the new ones are recvmmsg_time32
and compat_recvmmsg_time64 respectively. I expect that we will rename
the compat syscalls later as we start using generated syscall tables
everywhere and add these entry points.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/compat.h   |  3 +++
 include/linux/socket.h   |  9 ++++---
 include/linux/syscalls.h |  3 +++
 kernel/sys_ni.c          |  2 ++
 net/compat.c             | 34 ++++++++++----------------
 net/socket.c             | 62 +++++++++++++++++++++++++++++++++++-------------
 6 files changed, 72 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8be8daa38c9a..4b0463608589 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -893,6 +893,9 @@ asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages,
 asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid,
 					compat_pid_t pid, int sig,
 					struct compat_siginfo __user *uinfo);
+asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct __kernel_timespec __user *timeout);
 asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned vlen, unsigned int flags,
 				    struct old_timespec32 __user *timeout);
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8b571e9b9f76..333b5df8a1b2 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -348,7 +348,8 @@ struct ucred {
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
-struct timespec64;
+struct __kernel_timespec;
+struct old_timespec32;
 
 /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff
  * forbid_cmsg_compat==false
@@ -357,8 +358,10 @@ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg,
 			  unsigned int flags, bool forbid_cmsg_compat);
 extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg,
 			  unsigned int flags, bool forbid_cmsg_compat);
-extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
-			  unsigned int flags, struct timespec64 *timeout);
+extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
+			  unsigned int vlen, unsigned int flags,
+			  struct __kernel_timespec __user *timeout,
+			  struct old_timespec32 __user *timeout32);
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 247ad9eca955..03cda6793be3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -843,6 +843,9 @@ asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
 			     unsigned int vlen, unsigned flags,
 			     struct __kernel_timespec __user *timeout);
+asmlinkage long sys_recvmmsg_time32(int fd, struct mmsghdr __user *msg,
+			     unsigned int vlen, unsigned flags,
+			     struct old_timespec32 __user *timeout);
 
 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
 				int options, struct rusage __user *ru);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index df556175be50..ab9d0e3c6d50 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -284,7 +284,9 @@ COND_SYSCALL_COMPAT(move_pages);
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
 COND_SYSCALL(recvmmsg);
+COND_SYSCALL(recvmmsg_time32);
 COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg_time64);
 
 /*
  * Architecture specific syscalls: see further below
diff --git a/net/compat.c b/net/compat.c
index 47a614b370cd..f7084780a8f8 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -810,34 +810,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
 	return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen);
 }
 
-static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
-				 unsigned int vlen, unsigned int flags,
-				 struct old_timespec32 __user *timeout)
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg,
+		       unsigned int, vlen, unsigned int, flags,
+		       struct __kernel_timespec __user *, timeout)
 {
-	int datagrams;
-	struct timespec64 ktspec;
-
-	if (timeout == NULL)
-		return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
-				      flags | MSG_CMSG_COMPAT, NULL);
-
-	if (compat_get_timespec64(&ktspec, timeout))
-		return -EFAULT;
-
-	datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
-				   flags | MSG_CMSG_COMPAT, &ktspec);
-	if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout))
-		datagrams = -EFAULT;
-
-	return datagrams;
+	return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+			      flags | MSG_CMSG_COMPAT, timeout, NULL);
 }
 
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
 		       unsigned int, vlen, unsigned int, flags,
 		       struct old_timespec32 __user *, timeout)
 {
-	return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+	return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+			      flags | MSG_CMSG_COMPAT, NULL, timeout);
 }
+#endif
 
 COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 {
@@ -925,8 +914,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 		ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
 	case SYS_RECVMMSG:
-		ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
-					    compat_ptr(a[4]));
+		ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2],
+				     a[3] | MSG_CMSG_COMPAT, NULL,
+				     compat_ptr(a[4]));
 		break;
 	case SYS_ACCEPT4:
 		ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
diff --git a/net/socket.c b/net/socket.c
index 593826e11a53..f137a96628f1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2341,8 +2341,9 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
  *     Linux recvmmsg interface
  */
 
-int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
-		   unsigned int flags, struct timespec64 *timeout)
+static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
+			  unsigned int vlen, unsigned int flags,
+			  struct timespec64 *timeout)
 {
 	int fput_needed, err, datagrams;
 	struct socket *sock;
@@ -2451,25 +2452,32 @@ out_put:
 	return datagrams;
 }
 
-static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
-			   unsigned int vlen, unsigned int flags,
-			   struct __kernel_timespec __user *timeout)
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
+		   unsigned int vlen, unsigned int flags,
+		   struct __kernel_timespec __user *timeout,
+		   struct old_timespec32 __user *timeout32)
 {
 	int datagrams;
 	struct timespec64 timeout_sys;
 
-	if (flags & MSG_CMSG_COMPAT)
-		return -EINVAL;
-
-	if (!timeout)
-		return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+	if (timeout && get_timespec64(&timeout_sys, timeout))
+		return -EFAULT;
 
-	if (get_timespec64(&timeout_sys, timeout))
+	if (timeout32 && get_old_timespec32(&timeout_sys, timeout32))
 		return -EFAULT;
 
-	datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+	if (!timeout && !timeout32)
+		return do_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+	datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
 
-	if (datagrams > 0 && put_timespec64(&timeout_sys, timeout))
+	if (datagrams <= 0)
+		return datagrams;
+
+	if (timeout && put_timespec64(&timeout_sys, timeout))
+		datagrams = -EFAULT;
+
+	if (timeout32 && put_old_timespec32(&timeout_sys, timeout32))
 		datagrams = -EFAULT;
 
 	return datagrams;
@@ -2479,8 +2487,23 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
 		unsigned int, vlen, unsigned int, flags,
 		struct __kernel_timespec __user *, timeout)
 {
-	return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+
+	return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct old_timespec32 __user *, timeout)
+{
+	if (flags & MSG_CMSG_COMPAT)
+		return -EINVAL;
+
+	return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);
 }
+#endif
 
 #ifdef __ARCH_WANT_SYS_SOCKETCALL
 /* Argument list sizes for sys_socketcall */
@@ -2600,8 +2623,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 				    a[2], true);
 		break;
 	case SYS_RECVMMSG:
-		err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2],
-				      a[3], (struct __kernel_timespec __user *)a[4]);
+		if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
+			err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
+					     a[2], a[3],
+					     (struct __kernel_timespec __user *)a[4],
+					     NULL);
+		else
+			err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
+					     a[2], a[3], NULL,
+					     (struct old_timespec32 __user *)a[4]);
 		break;
 	case SYS_ACCEPT4:
 		err = __sys_accept4(a0, (struct sockaddr __user *)a1,
-- 
cgit 


From df8522a340ee4ccb725036e1f9145f5646939aed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Apr 2018 16:15:37 +0200
Subject: y2038: signal: Add sys_rt_sigtimedwait_time32

Once sys_rt_sigtimedwait() gets changed to a 64-bit time_t, we have
to provide compatibility support for existing binaries.

An earlier version of this patch reused the compat_sys_rt_sigtimedwait
entry point to avoid code duplication, but this newer approach
duplicates the existing native entry point instead, which seems
a bit cleaner.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/syscalls.h |  4 ++++
 kernel/signal.c          | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 03cda6793be3..251979d2e709 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -649,6 +649,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
 				siginfo_t __user *uinfo,
 				const struct __kernel_timespec __user *uts,
 				size_t sigsetsize);
+asmlinkage long sys_rt_sigtimedwait_time32(const sigset_t __user *uthese,
+				siginfo_t __user *uinfo,
+				const struct old_timespec32 __user *uts,
+				size_t sigsetsize);
 asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo);
 
 /* kernel/sys.c */
diff --git a/kernel/signal.c b/kernel/signal.c
index 3c8ea7a328e0..be6744cd0a11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3332,6 +3332,39 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
+		siginfo_t __user *, uinfo,
+		const struct old_timespec32 __user *, uts,
+		size_t, sigsetsize)
+{
+	sigset_t these;
+	struct timespec64 ts;
+	kernel_siginfo_t info;
+	int ret;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&these, uthese, sizeof(these)))
+		return -EFAULT;
+
+	if (uts) {
+		if (get_old_timespec32(&ts, uts))
+			return -EFAULT;
+	}
+
+	ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
+
+	if (ret > 0 && uinfo) {
+		if (copy_siginfo_to_user(uinfo, &info))
+			ret = -EFAULT;
+	}
+
+	return ret;
+}
+#endif
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
-- 
cgit 


From 2367c4b5fa09b2947d03c5cd23d7bc0200b7fe4f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Apr 2018 16:18:35 +0200
Subject: y2038: signal: Add compat_sys_rt_sigtimedwait_time64

Now that 32-bit architectures have two variants of sys_rt_sigtimedwaid()
for 32-bit and 64-bit time_t, we also need to have a second compat system
call entry point on the corresponding 64-bit architectures.

The traditional system call keeps getting handled
by compat_sys_rt_sigtimedwait(), and this adds a new
compat_sys_rt_sigtimedwait_time64() that differs only in the timeout
argument type.

The naming remains a bit asymmetric for the moment. Ideally we would
want to have compat_sys_rt_sigtimedwait_time32() for the old version
and compat_sys_rt_sigtimedwait() for the new one to mirror the names
of the native entry points, but renaming the existing system call
tables causes unnecessary churn. I would suggest renaming all such
system calls together at a later point.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/compat.h |  3 +++
 kernel/signal.c        | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 4b0463608589..056be0d03722 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -788,6 +788,9 @@ asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset,
 asmlinkage long compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese,
 		struct compat_siginfo __user *uinfo,
 		struct old_timespec32 __user *uts, compat_size_t sigsetsize);
+asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese,
+		struct compat_siginfo __user *uinfo,
+		struct __kernel_timespec __user *uts, compat_size_t sigsetsize);
 asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig,
 				struct compat_siginfo __user *uinfo);
 /* No generic prototype for rt_sigreturn */
diff --git a/kernel/signal.c b/kernel/signal.c
index be6744cd0a11..53e07d97ffe0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3366,6 +3366,37 @@ SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
 #endif
 
 #ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
+		struct compat_siginfo __user *, uinfo,
+		struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+	sigset_t s;
+	struct timespec64 t;
+	kernel_siginfo_t info;
+	long ret;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (get_compat_sigset(&s, uthese))
+		return -EFAULT;
+
+	if (uts) {
+		if (get_timespec64(&t, uts))
+			return -EFAULT;
+	}
+
+	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+	if (ret > 0 && uinfo) {
+		if (copy_siginfo_to_user32(uinfo, &info))
+			ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
 		struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
@@ -3396,6 +3427,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 	return ret;
 }
 #endif
+#endif
 
 /**
  *  sys_kill - send a signal to a process
-- 
cgit 


From 926617889dc8383a120c66a2ecf7959a69f96950 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 Aug 2018 14:15:23 +0200
Subject: timekeeping: remove unused {read,update}_persistent_clock

After arch/sh has removed the last reference to these functions,
we can remove them completely and just rely on the 64-bit time_t
based versions. This cleans up a rather ugly use of __weak
functions.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping32.h |  6 ------
 kernel/time/ntp.c             | 10 +---------
 kernel/time/timekeeping.c     | 12 ++----------
 3 files changed, 3 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h
index a502616f7e1c..0036ff314ac5 100644
--- a/include/linux/timekeeping32.h
+++ b/include/linux/timekeeping32.h
@@ -52,10 +52,4 @@ static inline void getboottime(struct timespec *ts)
 	*ts = timespec64_to_timespec(ts64);
 }
 
-/*
- * Persistent clock related interfaces
- */
-extern void read_persistent_clock(struct timespec *ts);
-extern int update_persistent_clock(struct timespec now);
-
 #endif
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c5e0cba3b39c..e23be418d015 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -555,17 +555,9 @@ static void sync_rtc_clock(void)
 }
 
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
-int __weak update_persistent_clock(struct timespec now)
-{
-	return -ENODEV;
-}
-
 int __weak update_persistent_clock64(struct timespec64 now64)
 {
-	struct timespec now;
-
-	now = timespec64_to_timespec(now64);
-	return update_persistent_clock(now);
+	return -ENODEV;
 }
 #endif
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d110c948805..eb09be4871b3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1467,7 +1467,7 @@ u64 timekeeping_max_deferment(void)
 }
 
 /**
- * read_persistent_clock -  Return time from the persistent clock.
+ * read_persistent_clock64 -  Return time from the persistent clock.
  *
  * Weak dummy function for arches that do not yet support it.
  * Reads the time from the battery backed persistent clock.
@@ -1475,20 +1475,12 @@ u64 timekeeping_max_deferment(void)
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-void __weak read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock64(struct timespec64 *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 }
 
-void __weak read_persistent_clock64(struct timespec64 *ts64)
-{
-	struct timespec ts;
-
-	read_persistent_clock(&ts);
-	*ts64 = timespec_to_timespec64(ts);
-}
-
 /**
  * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
  *                                        from the boot.
-- 
cgit 


From 437e78d3fd6d35e6d56230962e6d03bb5dcda7f6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 7 Dec 2018 13:41:02 +0100
Subject: timekeeping: remove timespec_add/timespec_del

The last users were removed a while ago since everyone moved to ktime_t,
so we can remove the two unused interfaces for old timespec structures.

With those two gone, set_normalized_timespec() is also unused, so
remove that as well.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time32.h | 25 -------------------------
 kernel/time/time.c     | 36 ------------------------------------
 2 files changed, 61 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time32.h b/include/linux/time32.h
index 61904a6c098f..118b9977080c 100644
--- a/include/linux/time32.h
+++ b/include/linux/time32.h
@@ -96,31 +96,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
-extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec);
-
-static inline struct timespec timespec_add(struct timespec lhs,
-						struct timespec rhs)
-{
-	struct timespec ts_delta;
-
-	set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec,
-				lhs.tv_nsec + rhs.tv_nsec);
-	return ts_delta;
-}
-
-/*
- * sub = lhs - rhs, in normalized form
- */
-static inline struct timespec timespec_sub(struct timespec lhs,
-						struct timespec rhs)
-{
-	struct timespec ts_delta;
-
-	set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec,
-				lhs.tv_nsec - rhs.tv_nsec);
-	return ts_delta;
-}
-
 /*
  * Returns true if the timespec is norm, false if denorm:
  */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index ad204cf6d001..532bb560252d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -386,42 +386,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
-/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
- *
- * @ts:		pointer to timespec variable to be set
- * @sec:	seconds to set
- * @nsec:	nanoseconds to set
- *
- * Set seconds and nanoseconds field of a timespec variable and
- * normalize to the timespec storage format
- *
- * Note: The tv_nsec part is always in the range of
- *	0 <= tv_nsec < NSEC_PER_SEC
- * For negative values only the tv_sec field is negative !
- */
-void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
-{
-	while (nsec >= NSEC_PER_SEC) {
-		/*
-		 * The following asm() prevents the compiler from
-		 * optimising this loop into a modulo operation. See
-		 * also __iter_div_u64_rem() in include/linux/time.h
-		 */
-		asm("" : "+rm"(nsec));
-		nsec -= NSEC_PER_SEC;
-		++sec;
-	}
-	while (nsec < 0) {
-		asm("" : "+rm"(nsec));
-		nsec += NSEC_PER_SEC;
-		--sec;
-	}
-	ts->tv_sec = sec;
-	ts->tv_nsec = nsec;
-}
-EXPORT_SYMBOL(set_normalized_timespec);
-
 /**
  * ns_to_timespec - Convert nanoseconds to timespec
  * @nsec:       the nanoseconds value to be converted
-- 
cgit 


From a38d1107f937ca95dcf820161ef44ea683d6a0b1 Mon Sep 17 00:00:00 2001
From: Matt Mullins <mmullins@fb.com>
Date: Wed, 12 Dec 2018 16:42:37 -0800
Subject: bpf: support raw tracepoints in modules

Distributions build drivers as modules, including network and filesystem
drivers which export numerous tracepoints.  This enables
bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints.

Signed-off-by: Matt Mullins <mmullins@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/module.h       |  4 ++
 include/linux/trace_events.h |  8 +++-
 kernel/bpf/syscall.c         | 11 +++--
 kernel/module.c              |  5 +++
 kernel/trace/bpf_trace.c     | 99 +++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 120 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index fce6b4335e36..5f147dd5e709 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -432,6 +432,10 @@ struct module {
 	unsigned int num_tracepoints;
 	tracepoint_ptr_t *tracepoints_ptrs;
 #endif
+#ifdef CONFIG_BPF_EVENTS
+	unsigned int num_bpf_raw_events;
+	struct bpf_raw_event_map *bpf_raw_events;
+#endif
 #ifdef HAVE_JUMP_LABEL
 	struct jump_entry *jump_entries;
 	unsigned int num_jump_entries;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 4130a5497d40..8a62731673f7 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
-struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
+void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
 			    u64 *probe_offset, u64 *probe_addr);
@@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf
 {
 	return -EOPNOTSUPP;
 }
-static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
 {
 	return NULL;
 }
+static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
+{
+}
 static inline int bpf_get_perf_event_info(const struct perf_event *event,
 					  u32 *prog_id, u32 *fd_type,
 					  const char **buf, u64 *probe_offset,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5db31067d85e..0607db304def 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1604,6 +1604,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
 		bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
 		bpf_prog_put(raw_tp->prog);
 	}
+	bpf_put_raw_tracepoint(raw_tp->btp);
 	kfree(raw_tp);
 	return 0;
 }
@@ -1629,13 +1630,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 		return -EFAULT;
 	tp_name[sizeof(tp_name) - 1] = 0;
 
-	btp = bpf_find_raw_tracepoint(tp_name);
+	btp = bpf_get_raw_tracepoint(tp_name);
 	if (!btp)
 		return -ENOENT;
 
 	raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
-	if (!raw_tp)
-		return -ENOMEM;
+	if (!raw_tp) {
+		err = -ENOMEM;
+		goto out_put_btp;
+	}
 	raw_tp->btp = btp;
 
 	prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
@@ -1663,6 +1666,8 @@ out_put_prog:
 	bpf_prog_put(prog);
 out_free_tp:
 	kfree(raw_tp);
+out_put_btp:
+	bpf_put_raw_tracepoint(btp);
 	return err;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index 49a405891587..06ec68f08387 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					     sizeof(*mod->tracepoints_ptrs),
 					     &mod->num_tracepoints);
 #endif
+#ifdef CONFIG_BPF_EVENTS
+	mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+					   sizeof(*mod->bpf_raw_events),
+					   &mod->num_bpf_raw_events);
+#endif
 #ifdef HAVE_JUMP_LABEL
 	mod->jump_entries = section_objs(info, "__jump_table",
 					sizeof(*mod->jump_entries),
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9864a35c8bb5..9ddb6fddb4e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,43 @@
 #include "trace_probe.h"
 #include "trace.h"
 
+#ifdef CONFIG_MODULES
+struct bpf_trace_module {
+	struct module *module;
+	struct list_head list;
+};
+
+static LIST_HEAD(bpf_trace_modules);
+static DEFINE_MUTEX(bpf_module_mutex);
+
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+	struct bpf_raw_event_map *btp, *ret = NULL;
+	struct bpf_trace_module *btm;
+	unsigned int i;
+
+	mutex_lock(&bpf_module_mutex);
+	list_for_each_entry(btm, &bpf_trace_modules, list) {
+		for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
+			btp = &btm->module->bpf_raw_events[i];
+			if (!strcmp(btp->tp->name, name)) {
+				if (try_module_get(btm->module))
+					ret = btp;
+				goto out;
+			}
+		}
+	}
+out:
+	mutex_unlock(&bpf_module_mutex);
+	return ret;
+}
+#else
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+	return NULL;
+}
+#endif /* CONFIG_MODULES */
+
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
@@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 extern struct bpf_raw_event_map __start__bpf_raw_tp[];
 extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
 
-struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
 {
 	struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
 
@@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
 		if (!strcmp(btp->tp->name, name))
 			return btp;
 	}
-	return NULL;
+
+	return bpf_get_raw_tracepoint_module(name);
+}
+
+void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
+{
+	struct module *mod = __module_address((unsigned long)btp);
+
+	if (mod)
+		module_put(mod);
 }
 
 static __always_inline
@@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 
 	return err;
 }
+
+#ifdef CONFIG_MODULES
+int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
+{
+	struct bpf_trace_module *btm, *tmp;
+	struct module *mod = module;
+
+	if (mod->num_bpf_raw_events == 0 ||
+	    (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+		return 0;
+
+	mutex_lock(&bpf_module_mutex);
+
+	switch (op) {
+	case MODULE_STATE_COMING:
+		btm = kzalloc(sizeof(*btm), GFP_KERNEL);
+		if (btm) {
+			btm->module = module;
+			list_add(&btm->list, &bpf_trace_modules);
+		}
+		break;
+	case MODULE_STATE_GOING:
+		list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
+			if (btm->module == module) {
+				list_del(&btm->list);
+				kfree(btm);
+				break;
+			}
+		}
+		break;
+	}
+
+	mutex_unlock(&bpf_module_mutex);
+
+	return 0;
+}
+
+static struct notifier_block bpf_module_nb = {
+	.notifier_call = bpf_event_notify,
+};
+
+int __init bpf_event_init(void)
+{
+	register_module_notifier(&bpf_module_nb);
+	return 0;
+}
+
+fs_initcall(bpf_event_init);
+#endif /* CONFIG_MODULES */
-- 
cgit 


From da791a667536bf8322042e38ca85d55a78d3c273 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Dec 2018 14:35:14 +0100
Subject: futex: Cure exit race

Stefan reported, that the glibc tst-robustpi4 test case fails
occasionally. That case creates the following race between
sys_exit() and sys_futex_lock_pi():

 CPU0				CPU1

 sys_exit()			sys_futex()
  do_exit()			 futex_lock_pi()
   exit_signals(tsk)		  No waiters:
    tsk->flags |= PF_EXITING;	  *uaddr == 0x00000PID
  mm_release(tsk)		  Set waiter bit
   exit_robust_list(tsk) {	  *uaddr = 0x80000PID;
      Set owner died		  attach_to_pi_owner() {
    *uaddr = 0xC0000000;	   tsk = get_task(PID);
   }				   if (!tsk->flags & PF_EXITING) {
  ...				     attach();
  tsk->flags |= PF_EXITPIDONE;	   } else {
				     if (!(tsk->flags & PF_EXITPIDONE))
				       return -EAGAIN;
				     return -ESRCH; <--- FAIL
				   }

ESRCH is returned all the way to user space, which triggers the glibc test
case assert. Returning ESRCH unconditionally is wrong here because the user
space value has been changed by the exiting task to 0xC0000000, i.e. the
FUTEX_OWNER_DIED bit is set and the futex PID value has been cleared. This
is a valid state and the kernel has to handle it, i.e. taking the futex.

Cure it by rereading the user space value when PF_EXITING and PF_EXITPIDONE
is set in the task which 'owns' the futex. If the value has changed, let
the kernel retry the operation, which includes all regular sanity checks
and correctly handles the FUTEX_OWNER_DIED case.

If it hasn't changed, then return ESRCH as there is no way to distinguish
this case from malfunctioning user space. This happens when the exiting
task did not have a robust list, the robust list was corrupted or the user
space value in the futex was simply bogus.

Reported-by: Stefan Liebler <stli@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200467
Link: https://lkml.kernel.org/r/20181210152311.986181245@linutronix.de
---
 kernel/futex.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index f423f9b6577e..5cc8083a4c89 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1148,11 +1148,65 @@ out_error:
 	return ret;
 }
 
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+			    struct task_struct *tsk)
+{
+	u32 uval2;
+
+	/*
+	 * If PF_EXITPIDONE is not yet set, then try again.
+	 */
+	if (tsk && !(tsk->flags & PF_EXITPIDONE))
+		return -EAGAIN;
+
+	/*
+	 * Reread the user space value to handle the following situation:
+	 *
+	 * CPU0				CPU1
+	 *
+	 * sys_exit()			sys_futex()
+	 *  do_exit()			 futex_lock_pi()
+	 *                                futex_lock_pi_atomic()
+	 *   exit_signals(tsk)		    No waiters:
+	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
+	 *  mm_release(tsk)		    Set waiter bit
+	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
+	 *      Set owner died		    attach_to_pi_owner() {
+	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
+	 *   }				     if (!tsk->flags & PF_EXITING) {
+	 *  ...				       attach();
+	 *  tsk->flags |= PF_EXITPIDONE;     } else {
+	 *				       if (!(tsk->flags & PF_EXITPIDONE))
+	 *				         return -EAGAIN;
+	 *				       return -ESRCH; <--- FAIL
+	 *				     }
+	 *
+	 * Returning ESRCH unconditionally is wrong here because the
+	 * user space value has been changed by the exiting task.
+	 *
+	 * The same logic applies to the case where the exiting task is
+	 * already gone.
+	 */
+	if (get_futex_value_locked(&uval2, uaddr))
+		return -EFAULT;
+
+	/* If the user space value has changed, try again. */
+	if (uval2 != uval)
+		return -EAGAIN;
+
+	/*
+	 * The exiting task did not have a robust list, the robust list was
+	 * corrupted or the user space value in *uaddr is simply bogus.
+	 * Give up and tell user space.
+	 */
+	return -ESRCH;
+}
+
 /*
  * Lookup the task for the TID provided from user space and attach to
  * it after doing proper sanity checks.
  */
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 			      struct futex_pi_state **ps)
 {
 	pid_t pid = uval & FUTEX_TID_MASK;
@@ -1162,12 +1216,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
 	 * the new pi_state to it, but bail out when TID = 0 [1]
+	 *
+	 * The !pid check is paranoid. None of the call sites should end up
+	 * with pid == 0, but better safe than sorry. Let the caller retry
 	 */
 	if (!pid)
-		return -ESRCH;
+		return -EAGAIN;
 	p = find_get_task_by_vpid(pid);
 	if (!p)
-		return -ESRCH;
+		return handle_exit_race(uaddr, uval, NULL);
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
@@ -1187,7 +1244,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 		 * set, we know that the task has finished the
 		 * cleanup:
 		 */
-		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+		int ret = handle_exit_race(uaddr, uval, p);
 
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);
@@ -1244,7 +1301,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
 	 * We are the first waiter - try to look up the owner based on
 	 * @uval and attach to it.
 	 */
-	return attach_to_pi_owner(uval, key, ps);
+	return attach_to_pi_owner(uaddr, uval, key, ps);
 }
 
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1352,7 +1409,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 	 * attach to the owner. If that fails, no harm done, we only
 	 * set the FUTEX_WAITERS bit in the user space variable.
 	 */
-	return attach_to_pi_owner(uval, key, ps);
+	return attach_to_pi_owner(uaddr, newval, key, ps);
 }
 
 /**
-- 
cgit 


From 0bae2d4d62d523f06ff1a8e88ce38b45400acd28 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Sat, 15 Dec 2018 03:34:40 -0500
Subject: bpf: correct slot_type marking logic to allow more stack slot sharing

Verifier is supposed to support sharing stack slot allocated to ptr with
SCALAR_VALUE for privileged program. However this doesn't happen for some
cases.

The reason is verifier is not clearing slot_type STACK_SPILL for all bytes,
it only clears part of them, while verifier is using:

  slot_type[0] == STACK_SPILL

as a convention to check one slot is ptr type.

So, the consequence of partial clearing slot_type is verifier could treat a
partially overridden ptr slot, which should now be a SCALAR_VALUE slot,
still as ptr slot, and rejects some valid programs.

Before this patch, test_xdp_noinline.o under bpf selftests, bpf_lxc.o and
bpf_netdev.o under Cilium bpf repo, when built with -mattr=+alu32 are
rejected due to this issue. After this patch, they all accepted.

There is no processed insn number change before and after this patch on
Cilium bpf programs.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       |  5 +++++
 tools/testing/selftests/bpf/test_verifier.c | 34 +++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0125731e2512..e0e77ffeefb8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1286,6 +1286,10 @@ static int check_stack_write(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
+		if (state->stack[spi].slot_type[0] == STACK_SPILL)
+			for (i = 0; i < BPF_REG_SIZE; i++)
+				state->stack[spi].slot_type[i] = STACK_MISC;
 
 		/* only mark the slot as written if all 8 bytes were written
 		 * otherwise read propagation may incorrectly stop too soon
@@ -1303,6 +1307,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		    register_is_null(&cur->regs[value_regno]))
 			type = STACK_ZERO;
 
+		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
 			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
 				type;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index f9de7fe0c26d..cf242734e2eb 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1001,14 +1001,44 @@ static struct bpf_test tests[] = {
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
 			/* mess up with R1 pointer on stack */
 			BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
-			/* fill back into R0 should fail */
+			/* fill back into R0 is fine for priv.
+			 * R0 now becomes SCALAR_VALUE.
+			 */
 			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+			/* Load from R0 should fail. */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8),
 			BPF_EXIT_INSN(),
 		},
 		.errstr_unpriv = "attempt to corrupt spilled",
-		.errstr = "corrupted spill",
+		.errstr = "R0 invalid mem access 'inv",
 		.result = REJECT,
 	},
+	{
+		"check corrupted spill/fill, LSB",
+		.insns = {
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+			BPF_ST_MEM(BPF_H, BPF_REG_10, -8, 0xcafe),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "attempt to corrupt spilled",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+		.retval = POINTER_VALUE,
+	},
+	{
+		"check corrupted spill/fill, MSB",
+		.insns = {
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+			BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x12345678),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "attempt to corrupt spilled",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+		.retval = POINTER_VALUE,
+	},
 	{
 		"invalid src register in STX",
 		.insns = {
-- 
cgit 


From 76c43ae84e3f455e0b460ed0c43799e018d09ee9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 18 Dec 2018 13:43:58 -0800
Subject: bpf: log struct/union attribute for forward type

Current btf internal verbose logger logs fwd type as
  [2] FWD A type_id=0
where A is the type name.

Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types
with kind_flag") introduced kind_flag which can be used
to distinguish whether a forward type is a struct or
union.

Also, "type_id=0" does not carry any meaningful
information for fwd type as btf_type.type = 0 is simply
enforced during btf verification and is not used
anywhere else.

This commit changed the log to
  [2] FWD A struct
if kind_flag = 0, or
  [2] FWD A union
if kind_flag = 1.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e804b26a0506..715f9fcf4712 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1595,12 +1595,18 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
 	return 0;
 }
 
+static void btf_fwd_type_log(struct btf_verifier_env *env,
+			     const struct btf_type *t)
+{
+	btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
+}
+
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.check_kflag_member = btf_df_check_kflag_member,
-	.log_details = btf_ref_type_log,
+	.log_details = btf_fwd_type_log,
 	.seq_show = btf_df_seq_show,
 };
 
-- 
cgit 


From c2899c3470de04823870b28646981695c0046efe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 18 Dec 2018 16:06:53 +0100
Subject: genirq/affinity: Remove excess indentation

Plus other coding style issues which stood out while staring at that code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/affinity.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 08c904eb7279..e423bff1928c 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -95,11 +95,11 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 }
 
 static int __irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs, int firstvec,
-				    cpumask_var_t *node_to_cpumask,
-				    const struct cpumask *cpu_mask,
-				    struct cpumask *nmsk,
-				    struct cpumask *masks)
+				      int startvec, int numvecs, int firstvec,
+				      cpumask_var_t *node_to_cpumask,
+				      const struct cpumask *cpu_mask,
+				      struct cpumask *nmsk,
+				      struct cpumask *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	int last_affv = firstvec + numvecs;
@@ -180,10 +180,10 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	cpumask_var_t nmsk, npresmsk;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-			return ret;
+		return ret;
 
 	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
-			goto fail;
+		goto fail;
 
 	ret = 0;
 	/* Stabilize the cpumasks */
@@ -212,7 +212,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	put_online_cpus();
 
 	if (nr_present < numvecs)
-			WARN_ON(nr_present + nr_others < numvecs);
+		WARN_ON(nr_present + nr_others < numvecs);
 
 	free_cpumask_var(npresmsk);
 
@@ -271,9 +271,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
 						curvec, node_to_cpumask, masks);
 		if (ret) {
-				kfree(masks);
-				masks = NULL;
-				goto outnodemsk;
+			kfree(masks);
+			masks = NULL;
+			goto outnodemsk;
 		}
 		curvec += this_vecs;
 		usedvecs += this_vecs;
-- 
cgit 


From bec04037e4e484f41ee4d9409e40616874169d20 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douliyangs@gmail.com>
Date: Tue, 4 Dec 2018 23:51:20 +0800
Subject: genirq/core: Introduce struct irq_affinity_desc

The interrupt affinity management uses straight cpumask pointers to convey
the automatically assigned affinity masks for managed interrupts. The core
interrupt descriptor allocation also decides based on the pointer being non
NULL whether an interrupt is managed or not.

Devices which use managed interrupts usually have two classes of
interrupts:

  - Interrupts for multiple device queues
  - Interrupts for general device management

Currently both classes are treated the same way, i.e. as managed
interrupts. The general interrupts get the default affinity mask assigned
while the device queue interrupts are spread out over the possible CPUs.

Treating the general interrupts as managed is both a limitation and under
certain circumstances a bug. Assume the following situation:

 default_irq_affinity = 4..7

So if CPUs 4-7 are offlined, then the core code will shut down the device
management interrupts because the last CPU in their affinity mask went
offline.

It's also a limitation because it's desired to allow manual placement of
the general device interrupts for various reasons. If they are marked
managed then the interrupt affinity setting from both user and kernel space
is disabled.

To remedy that situation it's required to convey more information than the
cpumasks through various interfaces related to interrupt descriptor
allocation.

Instead of adding yet another argument, create a new data structure
'irq_affinity_desc' which for now just contains the cpumask. This struct
can be expanded to convey auxilliary information in the next step.

No functional change, just preparatory work.

[ tglx: Simplified logic and clarified changelog ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pci@vger.kernel.org
Cc: kashyap.desai@broadcom.com
Cc: shivasharan.srikanteshwara@broadcom.com
Cc: sumit.saxena@broadcom.com
Cc: ming.lei@redhat.com
Cc: hch@lst.de
Cc: douliyang1@huawei.com
Link: https://lkml.kernel.org/r/20181204155122.6327-2-douliyangs@gmail.com
---
 drivers/pci/msi.c         |  9 ++++-----
 include/linux/interrupt.h | 14 ++++++++++++--
 include/linux/irq.h       |  6 ++++--
 include/linux/irqdomain.h |  6 ++++--
 include/linux/msi.h       |  4 ++--
 kernel/irq/affinity.c     | 22 ++++++++++++----------
 kernel/irq/devres.c       |  4 ++--
 kernel/irq/irqdesc.c      | 17 +++++++++--------
 kernel/irq/irqdomain.c    |  4 ++--
 kernel/irq/msi.c          |  8 ++++----
 10 files changed, 55 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 265ed3e4c920..7a1c8a09efa5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -534,14 +534,13 @@ error_attrs:
 static struct msi_desc *
 msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
 {
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	struct msi_desc *entry;
 	u16 control;
 
 	if (affd)
 		masks = irq_create_affinity_masks(nvec, affd);
 
-
 	/* MSI Entry Initialization */
 	entry = alloc_msi_entry(&dev->dev, nvec, masks);
 	if (!entry)
@@ -672,7 +671,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			      struct msix_entry *entries, int nvec,
 			      const struct irq_affinity *affd)
 {
-	struct cpumask *curmsk, *masks = NULL;
+	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
 	int ret, i;
 
@@ -1264,7 +1263,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 
 		for_each_pci_msi_entry(entry, dev) {
 			if (i == nr)
-				return entry->affinity;
+				return &entry->affinity->mask;
 			i++;
 		}
 		WARN_ON_ONCE(1);
@@ -1276,7 +1275,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 				 nr >= entry->nvec_used))
 			return NULL;
 
-		return &entry->affinity[nr];
+		return &entry->affinity[nr].mask;
 	} else {
 		return cpu_possible_mask;
 	}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index ca397ff40836..c44b7844dc83 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -257,6 +257,14 @@ struct irq_affinity {
 	int	*sets;
 };
 
+/**
+ * struct irq_affinity_desc - Interrupt affinity descriptor
+ * @mask:	cpumask to hold the affinity assignment
+ */
+struct irq_affinity_desc {
+	struct cpumask	mask;
+};
+
 #if defined(CONFIG_SMP)
 
 extern cpumask_var_t irq_default_affinity;
@@ -303,7 +311,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
-struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+struct irq_affinity_desc *
+irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
@@ -337,7 +347,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	return 0;
 }
 
-static inline struct cpumask *
+static inline struct irq_affinity_desc *
 irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
 {
 	return NULL;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c9bffda04a45..def2b2aac8b1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -27,6 +27,7 @@
 struct seq_file;
 struct module;
 struct msi_msg;
+struct irq_affinity_desc;
 enum irqchip_irq_state;
 
 /*
@@ -834,11 +835,12 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		      struct module *owner, const struct cpumask *affinity);
+		      struct module *owner,
+		      const struct irq_affinity_desc *affinity);
 
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity);
+			   const struct irq_affinity_desc *affinity);
 
 /* use macros to avoid needing export.h for THIS_MODULE */
 #define irq_alloc_descs(irq, from, cnt, node)	\
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 068aa46f0d55..35965f41d7be 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -43,6 +43,7 @@ struct irq_chip;
 struct irq_data;
 struct cpumask;
 struct seq_file;
+struct irq_affinity_desc;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
@@ -266,7 +267,7 @@ extern bool irq_domain_check_msi_remap(void);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
 				  irq_hw_number_t hwirq, int node,
-				  const struct cpumask *affinity);
+				  const struct irq_affinity_desc *affinity);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
@@ -449,7 +450,8 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par
 
 extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
-				   bool realloc, const struct cpumask *affinity);
+				   bool realloc,
+				   const struct irq_affinity_desc *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
 extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index eb213b87617c..784fb52b9900 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -76,7 +76,7 @@ struct msi_desc {
 	unsigned int			nvec_used;
 	struct device			*dev;
 	struct msi_msg			msg;
-	struct cpumask			*affinity;
+	struct irq_affinity_desc	*affinity;
 
 	union {
 		/* PCI MSI/X specific data */
@@ -138,7 +138,7 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 #endif /* CONFIG_PCI_MSI */
 
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
-				 const struct cpumask *affinity);
+				 const struct irq_affinity_desc *affinity);
 void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e423bff1928c..c0fe591b0dc9 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -99,7 +99,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				      cpumask_var_t *node_to_cpumask,
 				      const struct cpumask *cpu_mask,
 				      struct cpumask *nmsk,
-				      struct cpumask *masks)
+				      struct irq_affinity_desc *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	int last_affv = firstvec + numvecs;
@@ -117,7 +117,9 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask,
+					&masks[curvec].mask,
+					node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
@@ -150,7 +152,8 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				cpus_per_vec++;
 				--extra_vecs;
 			}
-			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+			irq_spread_init_one(&masks[curvec].mask, nmsk,
+						cpus_per_vec);
 		}
 
 		done += v;
@@ -173,7 +176,7 @@ out:
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
-				    struct cpumask *masks)
+				    struct irq_affinity_desc *masks)
 {
 	int curvec = startvec, nr_present, nr_others;
 	int ret = -ENOMEM;
@@ -226,15 +229,15 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
  * @nvecs:	The total number of vectors
  * @affd:	Description of the affinity requirements
  *
- * Returns the masks pointer or NULL if allocation failed.
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
-struct cpumask *
+struct irq_affinity_desc *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
 	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	int curvec, usedvecs;
 	cpumask_var_t *node_to_cpumask;
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	int i, nr_sets;
 
 	/*
@@ -254,8 +257,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
-
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
@@ -285,7 +287,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	else
 		curvec = affd->pre_vectors + usedvecs;
 	for (; curvec < nvecs; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6a682c229e10..5d5378ea0afe 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  * @cnt:	Number of consecutive irqs to allocate
  * @node:	Preferred node on which the irq descriptor should be allocated
  * @owner:	Owning module (can be NULL)
- * @affinity:	Optional pointer to an affinity mask array of size @cnt
+ * @affinity:	Optional pointer to an irq_affinity_desc array of size @cnt
  *		which hints where the irq descriptors should be allocated
  *		and which default affinities to use
  *
@@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  */
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity)
+			   const struct irq_affinity_desc *affinity)
 {
 	struct irq_desc_devres *dr;
 	int base;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 578d0e5f1b5b..cb401d6c5040 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -449,28 +449,29 @@ static void free_desc(unsigned int irq)
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
-		       const struct cpumask *affinity, struct module *owner)
+		       const struct irq_affinity_desc *affinity,
+		       struct module *owner)
 {
-	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
 	unsigned int flags;
 	int i;
 
 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0, mask = affinity; i < cnt; i++, mask++) {
-			if (cpumask_empty(mask))
+		for (i = 0; i < cnt; i++) {
+			if (cpumask_empty(&affinity[i].mask))
 				return -EINVAL;
 		}
 	}
 
 	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
-	mask = NULL;
 
 	for (i = 0; i < cnt; i++) {
+		const struct cpumask *mask = NULL;
+
 		if (affinity) {
 			node = cpu_to_node(cpumask_first(affinity));
-			mask = affinity;
+			mask = &affinity->mask;
 			affinity++;
 		}
 		desc = alloc_desc(start + i, node, flags, mask, owner);
@@ -575,7 +576,7 @@ static void free_desc(unsigned int irq)
 }
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
-			      const struct cpumask *affinity,
+			      const struct irq_affinity_desc *affinity,
 			      struct module *owner)
 {
 	u32 i;
@@ -705,7 +706,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  */
 int __ref
 __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		  struct module *owner, const struct cpumask *affinity)
+		  struct module *owner, const struct irq_affinity_desc *affinity)
 {
 	int start, ret;
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3366d11c3e02..8b0be4bd6565 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -969,7 +969,7 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
 int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
-			   int node, const struct cpumask *affinity)
+			   int node, const struct irq_affinity_desc *affinity)
 {
 	unsigned int hint;
 
@@ -1281,7 +1281,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
  */
 int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 			    unsigned int nr_irqs, int node, void *arg,
-			    bool realloc, const struct cpumask *affinity)
+			    bool realloc, const struct irq_affinity_desc *affinity)
 {
 	int i, ret, virq;
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 4ca2fd46645d..ad26fbcfbfc8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -23,11 +23,11 @@
  * @nvec:	The number of vectors used in this entry
  * @affinity:	Optional pointer to an affinity mask array size of @nvec
  *
- * If @affinity is not NULL then a an affinity array[@nvec] is allocated
- * and the affinity masks from @affinity are copied.
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks and flags from @affinity are copied.
  */
-struct msi_desc *
-alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
+struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
+				 const struct irq_affinity_desc *affinity)
 {
 	struct msi_desc *desc;
 
-- 
cgit 


From c410abbbacb9b378365ba17a30df08b4b9eec64f Mon Sep 17 00:00:00 2001
From: Dou Liyang <douliyangs@gmail.com>
Date: Tue, 4 Dec 2018 23:51:21 +0800
Subject: genirq/affinity: Add is_managed to struct irq_affinity_desc

Devices which use managed interrupts usually have two classes of
interrupts:

  - Interrupts for multiple device queues
  - Interrupts for general device management

Currently both classes are treated the same way, i.e. as managed
interrupts. The general interrupts get the default affinity mask assigned
while the device queue interrupts are spread out over the possible CPUs.

Treating the general interrupts as managed is both a limitation and under
certain circumstances a bug. Assume the following situation:

 default_irq_affinity = 4..7

So if CPUs 4-7 are offlined, then the core code will shut down the device
management interrupts because the last CPU in their affinity mask went
offline.

It's also a limitation because it's desired to allow manual placement of
the general device interrupts for various reasons. If they are marked
managed then the interrupt affinity setting from both user and kernel space
is disabled. That limitation was reported by Kashyap and Sumit.

Expand struct irq_affinity_desc with a new bit 'is_managed' which is set
for truly managed interrupts (queue interrupts) and cleared for the general
device interrupts.

[ tglx: Simplify code and massage changelog ]

Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Reported-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pci@vger.kernel.org
Cc: shivasharan.srikanteshwara@broadcom.com
Cc: ming.lei@redhat.com
Cc: hch@lst.de
Cc: bhelgaas@google.com
Cc: douliyang1@huawei.com
Link: https://lkml.kernel.org/r/20181204155122.6327-3-douliyangs@gmail.com
---
 include/linux/interrupt.h |  1 +
 kernel/irq/affinity.c     |  4 ++++
 kernel/irq/irqdesc.c      | 13 ++++++++-----
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c44b7844dc83..c672f34235e7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -263,6 +263,7 @@ struct irq_affinity {
  */
 struct irq_affinity_desc {
 	struct cpumask	mask;
+	unsigned int	is_managed : 1;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index c0fe591b0dc9..45b68b4ea48b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -289,6 +289,10 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
+	/* Mark the managed interrupts */
+	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+		masks[i].is_managed = 1;
+
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
 	return masks;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cb401d6c5040..ee062b7939d3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -453,27 +453,30 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 		       struct module *owner)
 {
 	struct irq_desc *desc;
-	unsigned int flags;
 	int i;
 
 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0; i < cnt; i++) {
+		for (i = 0; i < cnt; i++, i++) {
 			if (cpumask_empty(&affinity[i].mask))
 				return -EINVAL;
 		}
 	}
 
-	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
-
 	for (i = 0; i < cnt; i++) {
 		const struct cpumask *mask = NULL;
+		unsigned int flags = 0;
 
 		if (affinity) {
-			node = cpu_to_node(cpumask_first(affinity));
+			if (affinity->is_managed) {
+				flags = IRQD_AFFINITY_MANAGED |
+					IRQD_MANAGED_SHUTDOWN;
+			}
 			mask = &affinity->mask;
+			node = cpu_to_node(cpumask_first(mask));
 			affinity++;
 		}
+
 		desc = alloc_desc(start + i, node, flags, mask, owner);
 		if (!desc)
 			goto err;
-- 
cgit 


From d89b22d46a40da3a1630ecea111beaf3ef10bc21 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 3 Dec 2018 11:30:30 +1100
Subject: cred: add cred_fscmp() for comparing creds.

NFS needs to compare to credentials, to see if they can
be treated the same w.r.t. filesystem access.  Sometimes
an ordering is needed when credentials are used as a key
to an rbtree.
NFS currently has its own private credential management from
before 'struct cred' existed.  To move it over to more consistent
use of 'struct cred' we need a comparison function.
This patch adds that function.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/cred.h |  1 +
 kernel/cred.c        | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 7eed6101c791..f1085767e1b3 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -169,6 +169,7 @@ extern int change_create_files_as(struct cred *, struct inode *);
 extern int set_security_override(struct cred *, u32);
 extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
+extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
 
 /*
diff --git a/kernel/cred.c b/kernel/cred.c
index ecf03657e71c..0b3ac72bd717 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -19,6 +19,7 @@
 #include <linux/security.h>
 #include <linux/binfmts.h>
 #include <linux/cn_proc.h>
+#include <linux/uidgid.h>
 
 #if 0
 #define kdebug(FMT, ...)						\
@@ -564,6 +565,60 @@ void revert_creds(const struct cred *old)
 }
 EXPORT_SYMBOL(revert_creds);
 
+/**
+ * cred_fscmp - Compare two credentials with respect to filesystem access.
+ * @a: The first credential
+ * @b: The second credential
+ *
+ * cred_cmp() will return zero if both credentials have the same
+ * fsuid, fsgid, and supplementary groups.  That is, if they will both
+ * provide the same access to files based on mode/uid/gid.
+ * If the credentials are different, then either -1 or 1 will
+ * be returned depending on whether @a comes before or after @b
+ * respectively in an arbitrary, but stable, ordering of credentials.
+ *
+ * Return: -1, 0, or 1 depending on comparison
+ */
+int cred_fscmp(const struct cred *a, const struct cred *b)
+{
+	struct group_info *ga, *gb;
+	int g;
+
+	if (a == b)
+		return 0;
+	if (uid_lt(a->fsuid, b->fsuid))
+		return -1;
+	if (uid_gt(a->fsuid, b->fsuid))
+		return 1;
+
+	if (gid_lt(a->fsgid, b->fsgid))
+		return -1;
+	if (gid_gt(a->fsgid, b->fsgid))
+		return 1;
+
+	ga = a->group_info;
+	gb = b->group_info;
+	if (ga == gb)
+		return 0;
+	if (ga == NULL)
+		return -1;
+	if (gb == NULL)
+		return 1;
+	if (ga->ngroups < gb->ngroups)
+		return -1;
+	if (ga->ngroups > gb->ngroups)
+		return 1;
+
+	for (g = 0; g < ga->ngroups; g++) {
+		if (gid_lt(ga->gid[g], gb->gid[g]))
+			return -1;
+		if (gid_gt(ga->gid[g], gb->gid[g]))
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cred_fscmp);
+
 /*
  * initialise the credentials stuff
  */
-- 
cgit 


From 97d0fb239c041f5f99655af74812c3ab75cc4346 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 3 Dec 2018 11:30:30 +1100
Subject: cred: add get_cred_rcu()

Sometimes we want to opportunistically get a
ref to a cred in an rcu_read_lock protected section.
get_task_cred() does this, and NFS does as similar thing
with its own credential structures.
To prepare for NFS converting to use 'struct cred' more
uniformly, define get_cred_rcu(), and use it in
get_task_cred().

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/cred.h | 11 +++++++++++
 kernel/cred.c        |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f1085767e1b3..48979fcb95cf 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -252,6 +252,17 @@ static inline const struct cred *get_cred(const struct cred *cred)
 	return get_new_cred(nonconst_cred);
 }
 
+static inline const struct cred *get_cred_rcu(const struct cred *cred)
+{
+	struct cred *nonconst_cred = (struct cred *) cred;
+	if (!cred)
+		return NULL;
+	if (!atomic_inc_not_zero(&nonconst_cred->usage))
+		return NULL;
+	validate_creds(cred);
+	return cred;
+}
+
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
diff --git a/kernel/cred.c b/kernel/cred.c
index 0b3ac72bd717..ba60162249e8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -195,7 +195,7 @@ const struct cred *get_task_cred(struct task_struct *task)
 	do {
 		cred = __task_cred((task));
 		BUG_ON(!cred);
-	} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+	} while (!get_cred_rcu(cred));
 
 	rcu_read_unlock();
 	return cred;
-- 
cgit 


From a6d8e7637faac93f362b694311bff64d5a8c701e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 3 Dec 2018 11:30:30 +1100
Subject: cred: export get_task_cred().

There is no reason that modules should not be able
to use this, and NFS will need it when converted to
use 'struct cred'.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 kernel/cred.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index ba60162249e8..21f4a97085b4 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -200,6 +200,7 @@ const struct cred *get_task_cred(struct task_struct *task)
 	rcu_read_unlock();
 	return cred;
 }
+EXPORT_SYMBOL(get_task_cred);
 
 /*
  * Allocate blank credentials, such that the credentials can be filled in at a
-- 
cgit 


From fdbaa0beb78b7c8847e261fe2c32816e9d1c54cc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 19 Dec 2018 13:01:01 -0800
Subject: bpf: Ensure line_info.insn_off cannot point to insn with zero code

This patch rejects a line_info if the bpf insn code referred by
line_info.insn_off is 0. F.e. a broken userspace tool might generate
a line_info.insn_off that points to the second 8 bytes of a BPF_LD_IMM64.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e0e77ffeefb8..5c64281d566e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4980,6 +4980,14 @@ static int check_btf_line(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
+		if (!prog->insnsi[linfo[i].insn_off].code) {
+			verbose(env,
+				"Invalid insn code at line_info[%u].insn_off\n",
+				i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
 		if (!btf_name_by_offset(btf, linfo[i].line_off) ||
 		    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
 			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
-- 
cgit 


From 518a2f1925c3165befbf06b75e07636549d92c1c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Dec 2018 09:00:40 +0100
Subject: dma-mapping: zero memory returned from dma_alloc_*

If we want to map memory from the DMA allocator to userspace it must be
zeroed at allocation time to prevent stale data leaks.   We already do
this on most common architectures, but some architectures don't do this
yet, fix them up, either by passing GFP_ZERO when we use the normal page
allocator or doing a manual memset otherwise.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> [m68k]
Acked-by: Sam Ravnborg <sam@ravnborg.org> [sparc]
---
 arch/alpha/kernel/pci_iommu.c    | 2 +-
 arch/arc/mm/dma.c                | 2 +-
 arch/c6x/mm/dma-coherent.c       | 5 ++++-
 arch/m68k/kernel/dma.c           | 2 +-
 arch/microblaze/mm/consistent.c  | 2 +-
 arch/openrisc/kernel/dma.c       | 2 +-
 arch/parisc/kernel/pci-dma.c     | 4 ++--
 arch/s390/pci/pci_dma.c          | 2 +-
 arch/sparc/kernel/ioport.c       | 2 +-
 arch/sparc/mm/io-unit.c          | 2 +-
 arch/sparc/mm/iommu.c            | 2 +-
 arch/xtensa/kernel/pci-dma.c     | 2 +-
 drivers/misc/mic/host/mic_boot.c | 2 +-
 kernel/dma/virt.c                | 2 +-
 14 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index e1716e0d92fd..aa0f50d0f823 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -443,7 +443,7 @@ static void *alpha_pci_alloc_coherent(struct device *dev, size_t size,
 	gfp &= ~GFP_DMA;
 
 try_again:
-	cpu_addr = (void *)__get_free_pages(gfp, order);
+	cpu_addr = (void *)__get_free_pages(gfp | __GFP_ZERO, order);
 	if (! cpu_addr) {
 		printk(KERN_INFO "pci_alloc_consistent: "
 		       "get_free_pages failed from %pf\n",
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index db203ff69ccf..1525ac00fd02 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -33,7 +33,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	 */
 	BUG_ON(gfp & __GFP_HIGHMEM);
 
-	page = alloc_pages(gfp, order);
+	page = alloc_pages(gfp | __GFP_ZERO, order);
 	if (!page)
 		return NULL;
 
diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c
index 01305c787201..75b79571732c 100644
--- a/arch/c6x/mm/dma-coherent.c
+++ b/arch/c6x/mm/dma-coherent.c
@@ -78,6 +78,7 @@ static void __free_dma_pages(u32 addr, int order)
 void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		gfp_t gfp, unsigned long attrs)
 {
+	void *ret;
 	u32 paddr;
 	int order;
 
@@ -94,7 +95,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 	if (!paddr)
 		return NULL;
 
-	return phys_to_virt(paddr);
+	ret = phys_to_virt(paddr);
+	memset(ret, 0, 1 << order);
+	return ret;
 }
 
 /*
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index e99993c57d6b..b4aa853051bd 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -32,7 +32,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 	size = PAGE_ALIGN(size);
 	order = get_order(size);
 
-	page = alloc_pages(flag, order);
+	page = alloc_pages(flag | __GFP_ZERO, order);
 	if (!page)
 		return NULL;
 
diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c
index 45e0a1aa9357..3002cbca3059 100644
--- a/arch/microblaze/mm/consistent.c
+++ b/arch/microblaze/mm/consistent.c
@@ -81,7 +81,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	size = PAGE_ALIGN(size);
 	order = get_order(size);
 
-	vaddr = __get_free_pages(gfp, order);
+	vaddr = __get_free_pages(gfp | __GFP_ZERO, order);
 	if (!vaddr)
 		return NULL;
 
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 159336adfa2f..f79457cb3741 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -89,7 +89,7 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		.mm = &init_mm
 	};
 
-	page = alloc_pages_exact(size, gfp);
+	page = alloc_pages_exact(size, gfp | __GFP_ZERO);
 	if (!page)
 		return NULL;
 
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 04c48f1ef3fb..239162355b58 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -404,7 +404,7 @@ static void *pcxl_dma_alloc(struct device *dev, size_t size,
 	order = get_order(size);
 	size = 1 << (order + PAGE_SHIFT);
 	vaddr = pcxl_alloc_range(size);
-	paddr = __get_free_pages(flag, order);
+	paddr = __get_free_pages(flag | __GFP_ZERO, order);
 	flush_kernel_dcache_range(paddr, size);
 	paddr = __pa(paddr);
 	map_uncached_pages(vaddr, size, paddr);
@@ -429,7 +429,7 @@ static void *pcx_dma_alloc(struct device *dev, size_t size,
 	if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0)
 		return NULL;
 
-	addr = (void *)__get_free_pages(flag, get_order(size));
+	addr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size));
 	if (addr)
 		*dma_handle = (dma_addr_t)virt_to_phys(addr);
 
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 346ba382193a..9e52d1527f71 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -404,7 +404,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t map;
 
 	size = PAGE_ALIGN(size);
-	page = alloc_pages(flag, get_order(size));
+	page = alloc_pages(flag | __GFP_ZERO, get_order(size));
 	if (!page)
 		return NULL;
 
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index baa235652c27..f89603855f1e 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -325,7 +325,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		return NULL;
 
 	size = PAGE_ALIGN(size);
-	va = (void *) __get_free_pages(gfp, get_order(size));
+	va = (void *) __get_free_pages(gfp | __GFP_ZERO, get_order(size));
 	if (!va) {
 		printk("%s: no %zd pages\n", __func__, size >> PAGE_SHIFT);
 		return NULL;
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index 91be13935d40..f770ee7229d8 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -224,7 +224,7 @@ static void *iounit_alloc(struct device *dev, size_t len,
 		return NULL;
 
 	len = PAGE_ALIGN(len);
-	va = __get_free_pages(gfp, get_order(len));
+	va = __get_free_pages(gfp | __GFP_ZERO, get_order(len));
 	if (!va)
 		return NULL;
 
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index fb771a634452..e8d5d73ca40d 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -344,7 +344,7 @@ static void *sbus_iommu_alloc(struct device *dev, size_t len,
 		return NULL;
 
 	len = PAGE_ALIGN(len);
-	va = __get_free_pages(gfp, get_order(len));
+	va = __get_free_pages(gfp | __GFP_ZERO, get_order(len));
 	if (va == 0)
 		return NULL;
 
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 1fc138b6bc0a..9171bff76fc4 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -160,7 +160,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 						 flag & __GFP_NOWARN);
 
 	if (!page)
-		page = alloc_pages(flag, get_order(size));
+		page = alloc_pages(flag | __GFP_ZERO, get_order(size));
 
 	if (!page)
 		return NULL;
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index c327985c9523..6479435ac96b 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -149,7 +149,7 @@ static void *__mic_dma_alloc(struct device *dev, size_t size,
 	struct scif_hw_dev *scdev = dev_get_drvdata(dev);
 	struct mic_device *mdev = scdev_to_mdev(scdev);
 	dma_addr_t tmp;
-	void *va = kmalloc(size, gfp);
+	void *va = kmalloc(size, gfp | __GFP_ZERO);
 
 	if (va) {
 		tmp = mic_map_single(mdev, va, size);
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
index 631ddec4b60a..ebe128833af7 100644
--- a/kernel/dma/virt.c
+++ b/kernel/dma/virt.c
@@ -13,7 +13,7 @@ static void *dma_virt_alloc(struct device *dev, size_t size,
 {
 	void *ret;
 
-	ret = (void *)__get_free_pages(gfp, get_order(size));
+	ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
 	if (ret)
 		*dma_handle = (uintptr_t)ret;
 	return ret;
-- 
cgit 


From 960ea056561a08e2b837b2f02d22c53226414a84 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 19 Dec 2018 22:13:04 -0800
Subject: bpf: verifier: teach the verifier to reason about the BPF_JSET
 instruction

Some JITs (nfp) try to optimize code on their own.  It could make
sense in case of BPF_JSET instruction which is currently not interpreted
by the verifier, meaning for instance that dead could would not be
detected if it was under BPF_JSET branch.

Teach the verifier basics of BPF_JSET, JIT optimizations will be
removed shortly.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c64281d566e..98ed27bbd045 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3859,6 +3859,12 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 		if (tnum_is_const(reg->var_off))
 			return !tnum_equals_const(reg->var_off, val);
 		break;
+	case BPF_JSET:
+		if ((~reg->var_off.mask & reg->var_off.value) & val)
+			return 1;
+		if (!((reg->var_off.mask | reg->var_off.value) & val))
+			return 0;
+		break;
 	case BPF_JGT:
 		if (reg->umin_value > val)
 			return 1;
@@ -3943,6 +3949,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		 */
 		__mark_reg_known(false_reg, val);
 		break;
+	case BPF_JSET:
+		false_reg->var_off = tnum_and(false_reg->var_off,
+					      tnum_const(~val));
+		if (is_power_of_2(val))
+			true_reg->var_off = tnum_or(true_reg->var_off,
+						    tnum_const(val));
+		break;
 	case BPF_JGT:
 		false_reg->umax_value = min(false_reg->umax_value, val);
 		true_reg->umin_value = max(true_reg->umin_value, val + 1);
@@ -4015,6 +4028,13 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		 */
 		__mark_reg_known(false_reg, val);
 		break;
+	case BPF_JSET:
+		false_reg->var_off = tnum_and(false_reg->var_off,
+					      tnum_const(~val));
+		if (is_power_of_2(val))
+			true_reg->var_off = tnum_or(true_reg->var_off,
+						    tnum_const(val));
+		break;
 	case BPF_JGT:
 		true_reg->umax_value = min(true_reg->umax_value, val - 1);
 		false_reg->umin_value = max(false_reg->umin_value, val);
-- 
cgit 


From 9b38c4056b2736bb5902e8b0911832db666fd19b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 19 Dec 2018 22:13:06 -0800
Subject: bpf: verifier: reorder stack size check with dead code sanitization

Reorder the calls to check_max_stack_depth() and sanitize_dead_code()
to separate functions which can rewrite instructions from pure checks.

No functional changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98ed27bbd045..d27d5a880015 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6983,10 +6983,11 @@ skip_full_check:
 	free_states(env);
 
 	if (ret == 0)
-		sanitize_dead_code(env);
+		ret = check_max_stack_depth(env);
 
+	/* instruction rewrites happen after this point */
 	if (ret == 0)
-		ret = check_max_stack_depth(env);
+		sanitize_dead_code(env);
 
 	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
-- 
cgit 


From 8b1cce9f5832a8eda17d37a3c49fb7dd2d650f46 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 20 Dec 2018 17:35:47 +0100
Subject: dma-mapping: fix inverted logic in dma_supported

The cleanup in commit 356da6d0cde3 ("dma-mapping: bypass indirect calls
for dma-direct") accidentally inverted the logic in the check for the
presence of a ->dma_supported() callback. Switch this back to the way it
was to prevent a crash on boot.

Fixes: 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct")
Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/mapping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index fc84c81029d9..d7c34d2d1ba5 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -406,7 +406,7 @@ int dma_supported(struct device *dev, u64 mask)
 
 	if (dma_is_direct(ops))
 		return dma_direct_supported(dev, mask);
-	if (ops->dma_supported)
+	if (!ops->dma_supported)
 		return 1;
 	return ops->dma_supported(dev, mask);
 }
-- 
cgit 


From 77ea5f4cbe2084db9ab021ba73fb7eadf1610884 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 19 Dec 2018 17:00:23 +0100
Subject: bpf/cpumap: make sure frame_size for build_skb is aligned if headroom
 isn't

The frame_size passed to build_skb must be aligned, else it is
possible that the embedded struct skb_shared_info gets unaligned.

For correctness make sure that xdpf->headroom in included in the
alignment. No upstream drivers can hit this, as all XDP drivers provide
an aligned headroom.  This was discovered when playing with implementing
XDP support for mvneta, which have a 2 bytes DSA header, and this
Marvell ARM64 platform didn't like doing atomic operations on an
unaligned skb_shinfo(skb)->dataref addresses.

Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/cpumap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 24aac0d0f412..8974b3755670 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	pkt_data_start = xdpf->data - xdpf->headroom;
-- 
cgit 


From 5eed6f1dff87bfb5e545935def3843edf42800f2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Fri, 21 Dec 2018 14:30:54 -0800
Subject: fork,memcg: fix crash in free_thread_stack on memcg charge fail

Commit 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") will
result in fork failing if allocating a kernel stack for a task in
dup_task_struct exceeds the kernel memory allowance for that cgroup.

Unfortunately, it also results in a crash.

This is due to the code jumping to free_stack and calling
free_thread_stack when the memcg kernel stack charge fails, but without
tsk->stack pointing at the freshly allocated stack.

This in turn results in the vfree_atomic in free_thread_stack oopsing
with a backtrace like this:

#5 [ffffc900244efc88] die at ffffffff8101f0ab
 #6 [ffffc900244efcb8] do_general_protection at ffffffff8101cb86
 #7 [ffffc900244efce0] general_protection at ffffffff818ff082
    [exception RIP: llist_add_batch+7]
    RIP: ffffffff8150d487  RSP: ffffc900244efd98  RFLAGS: 00010282
    RAX: 0000000000000000  RBX: ffff88085ef55980  RCX: 0000000000000000
    RDX: ffff88085ef55980  RSI: 343834343531203a  RDI: 343834343531203a
    RBP: ffffc900244efd98   R8: 0000000000000001   R9: ffff8808578c3600
    R10: 0000000000000000  R11: 0000000000000001  R12: ffff88029f6c21c0
    R13: 0000000000000286  R14: ffff880147759b00  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffffc900244efda0] vfree_atomic at ffffffff811df2c7
 #9 [ffffc900244efdb8] copy_process at ffffffff81086e37
#10 [ffffc900244efe98] _do_fork at ffffffff810884e0
#11 [ffffc900244eff10] sys_vfork at ffffffff810887ff
#12 [ffffc900244eff20] do_syscall_64 at ffffffff81002a43
    RIP: 000000000049b948  RSP: 00007ffcdb307830  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000896030  RCX: 000000000049b948
    RDX: 0000000000000000  RSI: 00007ffcdb307790  RDI: 00000000005d7421
    RBP: 000000000067370f   R8: 00007ffcdb3077b0   R9: 000000000001ed00
    R10: 0000000000000008  R11: 0000000000000246  R12: 0000000000000040
    R13: 000000000000000f  R14: 0000000000000000  R15: 000000000088d018
    ORIG_RAX: 000000000000003a  CS: 0033  SS: 002b

The simplest fix is to assign tsk->stack right where it is allocated.

Link: http://lkml.kernel.org/r/20181214231726.7ee4843c@imladris.surriel.com
Fixes: 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting")
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff89c7b..e2a5156bc9c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -240,8 +240,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * free_thread_stack() can be called in interrupt context,
 	 * so cache the vm_struct.
 	 */
-	if (stack)
+	if (stack) {
 		tsk->stack_vm_area = find_vm_area(stack);
+		tsk->stack = stack;
+	}
 	return stack;
 #else
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -288,7 +290,10 @@ static struct kmem_cache *thread_stack_cache;
 static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+	unsigned long *stack;
+	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+	tsk->stack = stack;
+	return stack;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
-- 
cgit 


From e8d086ddb5339d72c60e6c7b8d28810f26960f9a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 18 Dec 2018 15:50:02 -0500
Subject: tracing: Fix ftrace_graph_get_ret_stack() to use task and not current

The function ftrace_graph_get_ret_stack() takes a task struct descriptor but
uses current as the task to perform the operations on. In pretty much all
cases the task decriptor is the same as current, so this wasn't an issue.
But there is a case in the ARM architecture that passes in a task that is
not current, and expects a result from that task, and this code breaks it.

Fixes: 51584396cff5 ("arm64: Use ftrace_graph_get_ret_stack() instead of curr_ret_stack")
Reported-by: James Morse <james.morse@arm.com>
Tested-by: James Morse <james.morse@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d4f04f0ca646..8dfd5021b933 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -246,10 +246,10 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 struct ftrace_ret_stack *
 ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
 {
-	idx = current->curr_ret_stack - idx;
+	idx = task->curr_ret_stack - idx;
 
 	if (idx >= 0 && idx <= task->curr_ret_stack)
-		return &current->ret_stack[idx];
+		return &task->ret_stack[idx];
 
 	return NULL;
 }
-- 
cgit 


From 6801f0d5ca00bed159c7f87870cc6f5411837544 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:20 -0600
Subject: tracing: Remove unnecessary hist trigger struct field

hist_field.var_idx is completely unused, so remove it.

Link: http://lkml.kernel.org/r/d4e066c0f509f5f13ad3babc8c33ca6e7ddc439a.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 82e72c48a5a9..d29bf8a8e1dd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -61,7 +61,6 @@ struct hist_field {
 	char				*system;
 	char				*event_name;
 	char				*name;
-	unsigned int			var_idx;
 	unsigned int			var_ref_idx;
 	bool                            read_once;
 };
-- 
cgit 


From 2f31ed9308cc9e95c149078b276a47360ab507bb Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:21 -0600
Subject: tracing: Change strlen to sizeof for hist trigger static strings

There's no need to use strlen() for static strings when the length is
already known, so update trace_events_hist.c with sizeof() for those
cases.

Link: http://lkml.kernel.org/r/e3e754f2bd18e56eaa8baf79bee619316ebf4cfc.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index d29bf8a8e1dd..25d06b3ae1f6 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -507,7 +507,7 @@ static int synth_field_string_size(char *type)
 	start = strstr(type, "char[");
 	if (start == NULL)
 		return -EINVAL;
-	start += strlen("char[");
+	start += sizeof("char[") - 1;
 
 	end = strchr(type, ']');
 	if (!end || end < start)
@@ -1843,8 +1843,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
 	if (attrs->n_actions >= HIST_ACTIONS_MAX)
 		return ret;
 
-	if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
-	    (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+	if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) ||
+	    (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) {
 		attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
 		if (!attrs->action_str[attrs->n_actions]) {
 			ret = -ENOMEM;
@@ -1861,34 +1861,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
 {
 	int ret = 0;
 
-	if ((strncmp(str, "key=", strlen("key=")) == 0) ||
-	    (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+	if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) ||
+	    (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) {
 		attrs->keys_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->keys_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
-		 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
-		 (strncmp(str, "values=", strlen("values=")) == 0)) {
+	} else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) ||
+		 (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) ||
+		 (strncmp(str, "values=", sizeof("values=") - 1) == 0)) {
 		attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->vals_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+	} else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) {
 		attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->sort_key_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "name=", strlen("name=")) == 0) {
+	} else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) {
 		attrs->name = kstrdup(str, GFP_KERNEL);
 		if (!attrs->name) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+	} else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) {
 		strsep(&str, "=");
 		if (!str) {
 			ret = -EINVAL;
@@ -1901,7 +1901,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "size=", strlen("size=")) == 0) {
+	} else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) {
 		int map_bits = parse_map_size(str);
 
 		if (map_bits < 0) {
@@ -3497,7 +3497,7 @@ static struct action_data *onmax_parse(char *str)
 	if (!onmax_fn_name || !str)
 		goto free;
 
-	if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
+	if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) {
 		char *params = strsep(&str, ")");
 
 		if (!params) {
@@ -4302,8 +4302,8 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 	for (i = 0; i < hist_data->attrs->n_actions; i++) {
 		str = hist_data->attrs->action_str[i];
 
-		if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
-			char *action_str = str + strlen("onmatch(");
+		if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) {
+			char *action_str = str + sizeof("onmatch(") - 1;
 
 			data = onmatch_parse(tr, action_str);
 			if (IS_ERR(data)) {
@@ -4311,8 +4311,8 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 				break;
 			}
 			data->fn = action_trace;
-		} else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
-			char *action_str = str + strlen("onmax(");
+		} else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) {
+			char *action_str = str + sizeof("onmax(") - 1;
 
 			data = onmax_parse(action_str);
 			if (IS_ERR(data)) {
@@ -5548,9 +5548,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 			p++;
 			continue;
 		}
-		if (p >= param + strlen(param) - strlen("if") - 1)
+		if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
 			return -EINVAL;
-		if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
+		if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
 			p++;
 			continue;
 		}
-- 
cgit 


From e4f6d245031e04bdd12db390298acec0474a1a46 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 19 Dec 2018 13:09:16 -0600
Subject: tracing: Use var_refs[] for hist trigger reference checking

Since all the variable reference hist_fields are collected into
hist_data->var_refs[] array, there's no need to go through all the
fields looking for them, or in separate arrays like synth_var_refs[],
which will be going away soon anyway.

This also allows us to get rid of some unnecessary code and functions
currently used for the same purpose.

Link: http://lkml.kernel.org/r/1545246556.4239.7.camel@gmail.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 68 +++++++---------------------------------
 1 file changed, 11 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 25d06b3ae1f6..f3caf6e484f4 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1297,75 +1297,29 @@ check_field_for_var_ref(struct hist_field *hist_field,
 			struct hist_trigger_data *var_data,
 			unsigned int var_idx)
 {
-	struct hist_field *found = NULL;
-
-	if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
-		if (hist_field->var.idx == var_idx &&
-		    hist_field->var.hist_data == var_data) {
-			found = hist_field;
-		}
-	}
-
-	return found;
-}
-
-static struct hist_field *
-check_field_for_var_refs(struct hist_trigger_data *hist_data,
-			 struct hist_field *hist_field,
-			 struct hist_trigger_data *var_data,
-			 unsigned int var_idx,
-			 unsigned int level)
-{
-	struct hist_field *found = NULL;
-	unsigned int i;
-
-	if (level > 3)
-		return found;
-
-	if (!hist_field)
-		return found;
-
-	found = check_field_for_var_ref(hist_field, var_data, var_idx);
-	if (found)
-		return found;
-
-	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
-		struct hist_field *operand;
+	WARN_ON(!(hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF));
 
-		operand = hist_field->operands[i];
-		found = check_field_for_var_refs(hist_data, operand, var_data,
-						 var_idx, level + 1);
-		if (found)
-			return found;
-	}
+	if (hist_field && hist_field->var.idx == var_idx &&
+	    hist_field->var.hist_data == var_data)
+		return hist_field;
 
-	return found;
+	return NULL;
 }
 
 static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
 				       struct hist_trigger_data *var_data,
 				       unsigned int var_idx)
 {
-	struct hist_field *hist_field, *found = NULL;
+	struct hist_field *hist_field;
 	unsigned int i;
 
-	for_each_hist_field(i, hist_data) {
-		hist_field = hist_data->fields[i];
-		found = check_field_for_var_refs(hist_data, hist_field,
-						 var_data, var_idx, 0);
-		if (found)
-			return found;
-	}
-
-	for (i = 0; i < hist_data->n_synth_var_refs; i++) {
-		hist_field = hist_data->synth_var_refs[i];
-		found = check_field_for_var_refs(hist_data, hist_field,
-						 var_data, var_idx, 0);
-		if (found)
-			return found;
+	for (i = 0; i < hist_data->n_var_refs; i++) {
+		hist_field = hist_data->var_refs[i];
+		if (check_field_for_var_ref(hist_field, var_data, var_idx))
+			return hist_field;
 	}
 
-	return found;
+	return NULL;
 }
 
 static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
-- 
cgit 


From de40f033d4e84e843d6a12266e3869015ea9097c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:23 -0600
Subject: tracing: Remove open-coding of hist trigger var_ref management

Have create_var_ref() manage the hist trigger's var_ref list, rather
than having similar code doing it in multiple places.  This cleans up
the code and makes sure var_refs are always accounted properly.

Also, document the var_ref-related functions to make what their
purpose clearer.

Link: http://lkml.kernel.org/r/05ddae93ff514e66fc03897d6665231892939913.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 93 ++++++++++++++++++++++++++++++++--------
 1 file changed, 75 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f3caf6e484f4..6309f4dbfb9c 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1292,6 +1292,17 @@ static u64 hist_field_cpu(struct hist_field *hist_field,
 	return cpu;
 }
 
+/**
+ * check_field_for_var_ref - Check if a VAR_REF field references a variable
+ * @hist_field: The VAR_REF field to check
+ * @var_data: The hist trigger that owns the variable
+ * @var_idx: The trigger variable identifier
+ *
+ * Check the given VAR_REF field to see whether or not it references
+ * the given variable associated with the given trigger.
+ *
+ * Return: The VAR_REF field if it does reference the variable, NULL if not
+ */
 static struct hist_field *
 check_field_for_var_ref(struct hist_field *hist_field,
 			struct hist_trigger_data *var_data,
@@ -1306,6 +1317,18 @@ check_field_for_var_ref(struct hist_field *hist_field,
 	return NULL;
 }
 
+/**
+ * find_var_ref - Check if a trigger has a reference to a trigger variable
+ * @hist_data: The hist trigger that might have a reference to the variable
+ * @var_data: The hist trigger that owns the variable
+ * @var_idx: The trigger variable identifier
+ *
+ * Check the list of var_refs[] on the first hist trigger to see
+ * whether any of them are references to the variable on the second
+ * trigger.
+ *
+ * Return: The VAR_REF field referencing the variable if so, NULL if not
+ */
 static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
 				       struct hist_trigger_data *var_data,
 				       unsigned int var_idx)
@@ -1322,6 +1345,20 @@ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
 	return NULL;
 }
 
+/**
+ * find_any_var_ref - Check if there is a reference to a given trigger variable
+ * @hist_data: The hist trigger
+ * @var_idx: The trigger variable identifier
+ *
+ * Check to see whether the given variable is currently referenced by
+ * any other trigger.
+ *
+ * The trigger the variable is defined on is explicitly excluded - the
+ * assumption being that a self-reference doesn't prevent a trigger
+ * from being removed.
+ *
+ * Return: The VAR_REF field referencing the variable if so, NULL if not
+ */
 static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
 					   unsigned int var_idx)
 {
@@ -1340,6 +1377,19 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
 	return found;
 }
 
+/**
+ * check_var_refs - Check if there is a reference to any of trigger's variables
+ * @hist_data: The hist trigger
+ *
+ * A trigger can define one or more variables.  If any one of them is
+ * currently referenced by any other trigger, this function will
+ * determine that.
+
+ * Typically used to determine whether or not a trigger can be removed
+ * - if there are any references to a trigger's variables, it cannot.
+ *
+ * Return: True if there is a reference to any of trigger's variables
+ */
 static bool check_var_refs(struct hist_trigger_data *hist_data)
 {
 	struct hist_field *field;
@@ -2343,7 +2393,23 @@ static int init_var_ref(struct hist_field *ref_field,
 	goto out;
 }
 
-static struct hist_field *create_var_ref(struct hist_field *var_field,
+/**
+ * create_var_ref - Create a variable reference and attach it to trigger
+ * @hist_data: The trigger that will be referencing the variable
+ * @var_field: The VAR field to create a reference to
+ * @system: The optional system string
+ * @event_name: The optional event_name string
+ *
+ * Given a variable hist_field, create a VAR_REF hist_field that
+ * represents a reference to it.
+ *
+ * This function also adds the reference to the trigger that
+ * now references the variable.
+ *
+ * Return: The VAR_REF field if successful, NULL if not
+ */
+static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
+					 struct hist_field *var_field,
 					 char *system, char *event_name)
 {
 	unsigned long flags = HIST_FIELD_FL_VAR_REF;
@@ -2355,6 +2421,9 @@ static struct hist_field *create_var_ref(struct hist_field *var_field,
 			destroy_hist_field(ref_field, 0);
 			return NULL;
 		}
+
+		hist_data->var_refs[hist_data->n_var_refs] = ref_field;
+		ref_field->var_ref_idx = hist_data->n_var_refs++;
 	}
 
 	return ref_field;
@@ -2428,7 +2497,8 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
 
 	var_field = find_event_var(hist_data, system, event_name, var_name);
 	if (var_field)
-		ref_field = create_var_ref(var_field, system, event_name);
+		ref_field = create_var_ref(hist_data, var_field,
+					   system, event_name);
 
 	if (!ref_field)
 		hist_err_event("Couldn't find variable: $",
@@ -2546,8 +2616,6 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
 	if (!s) {
 		hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
 		if (hist_field) {
-			hist_data->var_refs[hist_data->n_var_refs] = hist_field;
-			hist_field->var_ref_idx = hist_data->n_var_refs++;
 			if (var_name) {
 				hist_field = create_alias(hist_data, hist_field, var_name);
 				if (!hist_field) {
@@ -3321,7 +3389,6 @@ static int onmax_create(struct hist_trigger_data *hist_data,
 	unsigned int var_ref_idx = hist_data->n_var_refs;
 	struct field_var *field_var;
 	char *onmax_var_str, *param;
-	unsigned long flags;
 	unsigned int i;
 	int ret = 0;
 
@@ -3338,18 +3405,10 @@ static int onmax_create(struct hist_trigger_data *hist_data,
 		return -EINVAL;
 	}
 
-	flags = HIST_FIELD_FL_VAR_REF;
-	ref_field = create_hist_field(hist_data, NULL, flags, NULL);
+	ref_field = create_var_ref(hist_data, var_field, NULL, NULL);
 	if (!ref_field)
 		return -ENOMEM;
 
-	if (init_var_ref(ref_field, var_field, NULL, NULL)) {
-		destroy_hist_field(ref_field, 0);
-		ret = -ENOMEM;
-		goto out;
-	}
-	hist_data->var_refs[hist_data->n_var_refs] = ref_field;
-	ref_field->var_ref_idx = hist_data->n_var_refs++;
 	data->onmax.var = ref_field;
 
 	data->fn = onmax_save;
@@ -3538,9 +3597,6 @@ static void save_synth_var_ref(struct hist_trigger_data *hist_data,
 			 struct hist_field *var_ref)
 {
 	hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
-
-	hist_data->var_refs[hist_data->n_var_refs] = var_ref;
-	var_ref->var_ref_idx = hist_data->n_var_refs++;
 }
 
 static int check_synth_field(struct synth_event *event,
@@ -3694,7 +3750,8 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
 		}
 
 		if (check_synth_field(event, hist_field, field_pos) == 0) {
-			var_ref = create_var_ref(hist_field, system, event_name);
+			var_ref = create_var_ref(hist_data, hist_field,
+						 system, event_name);
 			if (!var_ref) {
 				kfree(p);
 				ret = -ENOMEM;
-- 
cgit 


From 656fe2ba85e81d00e4447bf77b8da2be3c47acb2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:24 -0600
Subject: tracing: Use hist trigger's var_ref array to destroy var_refs

Since every var ref for a trigger has an entry in the var_ref[] array,
use that to destroy the var_refs, instead of piecemeal via the field
expressions.

This allows us to avoid having to keep and treat differently separate
lists for the action-related references, which future patches will
remove.

Link: http://lkml.kernel.org/r/fad1a164f0e257c158e70d6eadbf6c586e04b2a2.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6309f4dbfb9c..923c572ee68d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2190,6 +2190,15 @@ static int contains_operator(char *str)
 	return field_op;
 }
 
+static void __destroy_hist_field(struct hist_field *hist_field)
+{
+	kfree(hist_field->var.name);
+	kfree(hist_field->name);
+	kfree(hist_field->type);
+
+	kfree(hist_field);
+}
+
 static void destroy_hist_field(struct hist_field *hist_field,
 			       unsigned int level)
 {
@@ -2201,14 +2210,13 @@ static void destroy_hist_field(struct hist_field *hist_field,
 	if (!hist_field)
 		return;
 
+	if (hist_field->flags & HIST_FIELD_FL_VAR_REF)
+		return; /* var refs will be destroyed separately */
+
 	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
 		destroy_hist_field(hist_field->operands[i], level + 1);
 
-	kfree(hist_field->var.name);
-	kfree(hist_field->name);
-	kfree(hist_field->type);
-
-	kfree(hist_field);
+	__destroy_hist_field(hist_field);
 }
 
 static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
@@ -2335,6 +2343,12 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 			hist_data->fields[i] = NULL;
 		}
 	}
+
+	for (i = 0; i < hist_data->n_var_refs; i++) {
+		WARN_ON(!(hist_data->var_refs[i]->flags & HIST_FIELD_FL_VAR_REF));
+		__destroy_hist_field(hist_data->var_refs[i]);
+		hist_data->var_refs[i] = NULL;
+	}
 }
 
 static int init_var_ref(struct hist_field *ref_field,
-- 
cgit 


From 912201345f7c39e6b0ac283207be2b6641fa47b9 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:25 -0600
Subject: tracing: Remove hist trigger synth_var_refs

All var_refs are now handled uniformly and there's no reason to treat
the synth_refs in a special way now, so remove them and associated
functions.

Link: http://lkml.kernel.org/r/b4d3470526b8f0426dcec125399dad9ad9b8589d.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 923c572ee68d..1b1aee944588 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -279,8 +279,6 @@ struct hist_trigger_data {
 	struct action_data		*actions[HIST_ACTIONS_MAX];
 	unsigned int			n_actions;
 
-	struct hist_field               *synth_var_refs[SYNTH_FIELDS_MAX];
-	unsigned int                    n_synth_var_refs;
 	struct field_var		*field_vars[SYNTH_FIELDS_MAX];
 	unsigned int			n_field_vars;
 	unsigned int			n_field_var_str;
@@ -3599,20 +3597,6 @@ static void save_field_var(struct hist_trigger_data *hist_data,
 }
 
 
-static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
-{
-	unsigned int i;
-
-	for (i = 0; i < hist_data->n_synth_var_refs; i++)
-		destroy_hist_field(hist_data->synth_var_refs[i], 0);
-}
-
-static void save_synth_var_ref(struct hist_trigger_data *hist_data,
-			 struct hist_field *var_ref)
-{
-	hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
-}
-
 static int check_synth_field(struct synth_event *event,
 			     struct hist_field *hist_field,
 			     unsigned int field_pos)
@@ -3772,7 +3756,6 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
 				goto err;
 			}
 
-			save_synth_var_ref(hist_data, var_ref);
 			field_pos++;
 			kfree(p);
 			continue;
@@ -4516,7 +4499,6 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data)
 	destroy_actions(hist_data);
 	destroy_field_vars(hist_data);
 	destroy_field_var_hists(hist_data);
-	destroy_synth_var_refs(hist_data);
 
 	kfree(hist_data);
 }
-- 
cgit 


From 05ddb25cb31491b0901d057c40ad50d01b3db783 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Tue, 18 Dec 2018 14:33:26 -0600
Subject: tracing: Add hist trigger comments for variable-related fields

Add a few comments to help clarify how variable and variable reference
fields are used in the code.

Link: http://lkml.kernel.org/r/ea857ce948531d7bec712bbb0f38360aa1d378ec.1545161087.git.tom.zanussi@linux.intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1b1aee944588..c5448c103770 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -40,6 +40,16 @@ enum field_op_id {
 	FIELD_OP_UNARY_MINUS,
 };
 
+/*
+ * A hist_var (histogram variable) contains variable information for
+ * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
+ * flag set.  A hist_var has a variable name e.g. ts0, and is
+ * associated with a given histogram trigger, as specified by
+ * hist_data.  The hist_var idx is the unique index assigned to the
+ * variable by the hist trigger's tracing_map.  The idx is what is
+ * used to set a variable's value and, by a variable reference, to
+ * retrieve it.
+ */
 struct hist_var {
 	char				*name;
 	struct hist_trigger_data	*hist_data;
@@ -56,11 +66,29 @@ struct hist_field {
 	const char			*type;
 	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
 	struct hist_trigger_data	*hist_data;
+
+	/*
+	 * Variable fields contain variable-specific info in var.
+	 */
 	struct hist_var			var;
 	enum field_op_id		operator;
 	char				*system;
 	char				*event_name;
+
+	/*
+	 * The name field is used for EXPR and VAR_REF fields.  VAR
+	 * fields contain the variable name in var.name.
+	 */
 	char				*name;
+
+	/*
+	 * When a histogram trigger is hit, if it has any references
+	 * to variables, the values of those variables are collected
+	 * into a var_ref_vals array by resolve_var_refs().  The
+	 * current value of each variable is read from the tracing_map
+	 * using the hist field's hist_var.idx and entered into the
+	 * var_ref_idx entry i.e. var_ref_vals[var_ref_idx].
+	 */
 	unsigned int			var_ref_idx;
 	bool                            read_once;
 };
@@ -365,6 +393,14 @@ struct action_data {
 
 	union {
 		struct {
+			/*
+			 * When a histogram trigger is hit, the values of any
+			 * references to variables, including variables being passed
+			 * as parameters to synthetic events, are collected into a
+			 * var_ref_vals array.  This var_ref_idx is the index of the
+			 * first param in the array to be passed to the synthetic
+			 * event invocation.
+			 */
 			unsigned int		var_ref_idx;
 			char			*match_event;
 			char			*match_event_system;
-- 
cgit 


From 59dd974bc079079c23b5429cba841696fa7fae41 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 29 Oct 2018 23:35:40 +0100
Subject: tracing: Merge seq_print_sym_short() and seq_print_sym_offset()

These two functions are nearly identical, so we can avoid some code
duplication by moving the conditional into a common implementation.

Link: http://lkml.kernel.org/r/20181029223542.26175-2-linux@rasmusvillemoes.dk

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6e6cc64faa38..85ecd061c7be 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -339,34 +339,17 @@ static inline const char *kretprobed(const char *name)
 #endif /* CONFIG_KRETPROBES */
 
 static void
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address,
+	      bool offset)
 {
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_KALLSYMS
 	const char *name;
 
-	kallsyms_lookup(address, NULL, NULL, NULL, str);
-
-	name = kretprobed(str);
-
-	if (name && strlen(name)) {
-		trace_seq_printf(s, fmt, name);
-		return;
-	}
-#endif
-	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
-	trace_seq_printf(s, fmt, str);
-}
-
-static void
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
-		     unsigned long address)
-{
-	char str[KSYM_SYMBOL_LEN];
-#ifdef CONFIG_KALLSYMS
-	const char *name;
-
-	sprint_symbol(str, address);
+	if (offset)
+		sprint_symbol(str, address);
+	else
+		kallsyms_lookup(address, NULL, NULL, NULL, str);
 	name = kretprobed(str);
 
 	if (name && strlen(name)) {
@@ -424,10 +407,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 		goto out;
 	}
 
-	if (sym_flags & TRACE_ITER_SYM_OFFSET)
-		seq_print_sym_offset(s, "%s", ip);
-	else
-		seq_print_sym_short(s, "%s", ip);
+	seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET);
 
 	if (sym_flags & TRACE_ITER_SYM_ADDR)
 		trace_seq_printf(s, " <" IP_FMT ">", ip);
-- 
cgit 


From cc9f59fb3bc455984404dbab8be16f156a47d023 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 29 Oct 2018 23:35:41 +0100
Subject: tracing: Avoid -Wformat-nonliteral warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building with -Wformat-nonliteral, gcc complains

kernel/trace/trace_output.c: In function ‘seq_print_sym’:
kernel/trace/trace_output.c:356:3: warning: format not a string literal, argument types not checked [-Wformat-nonliteral]
   trace_seq_printf(s, fmt, name);

But seq_print_sym only has a single caller which passes "%s" as fmt, so
we might as well just use that directly. That also paves the way for
further cleanups that will actually make that format string go away
entirely.

Link: http://lkml.kernel.org/r/20181029223542.26175-3-linux@rasmusvillemoes.dk

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 85ecd061c7be..f06fb899b746 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -339,8 +339,7 @@ static inline const char *kretprobed(const char *name)
 #endif /* CONFIG_KRETPROBES */
 
 static void
-seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address,
-	      bool offset)
+seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
 {
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_KALLSYMS
@@ -353,12 +352,12 @@ seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address,
 	name = kretprobed(str);
 
 	if (name && strlen(name)) {
-		trace_seq_printf(s, fmt, name);
+		trace_seq_printf(s, "%s", name);
 		return;
 	}
 #endif
 	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
-	trace_seq_printf(s, fmt, str);
+	trace_seq_printf(s, "%s", str);
 }
 
 #ifndef CONFIG_64BIT
@@ -407,7 +406,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 		goto out;
 	}
 
-	seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET);
+	seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET);
 
 	if (sym_flags & TRACE_ITER_SYM_ADDR)
 		trace_seq_printf(s, " <" IP_FMT ">", ip);
-- 
cgit 


From bea6957d5cd7cbb327083d3bcf080133ee63ef65 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 29 Oct 2018 23:35:42 +0100
Subject: tracing: Simplify printf'ing in seq_print_sym

trace_seq_printf(..., "%s", ...) can be done with trace_seq_puts()
instead, avoiding printf overhead. In the second instance, the string
we're copying was just created from an snprintf() to a stack buffer, so
we might as well do that printf directly. This naturally leads to moving
the declaration of the str buffer inside the CONFIG_KALLSYMS guard,
which in turn will make gcc inline the function for !CONFIG_KALLSYMS (it
only has a single caller, but the huge stack frame seems to make gcc not
inline it for CONFIG_KALLSYMS).

Link: http://lkml.kernel.org/r/20181029223542.26175-4-linux@rasmusvillemoes.dk

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f06fb899b746..54373d93e251 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -341,8 +341,8 @@ static inline const char *kretprobed(const char *name)
 static void
 seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
 {
-	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
 	const char *name;
 
 	if (offset)
@@ -352,12 +352,11 @@ seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
 	name = kretprobed(str);
 
 	if (name && strlen(name)) {
-		trace_seq_printf(s, "%s", name);
+		trace_seq_puts(s, name);
 		return;
 	}
 #endif
-	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
-	trace_seq_printf(s, "%s", str);
+	trace_seq_printf(s, "0x%08lx", address);
 }
 
 #ifndef CONFIG_64BIT
-- 
cgit 


From 1cce377df1800154301525a8ee04100dd83c9633 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 16 May 2018 21:30:12 +0200
Subject: tracing: Make function ‘ftrace_exports’ static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit 478409dd683d ("tracing: Add hook to function tracing for other
subsystems to use"), a new function ‘ftrace_exports’ was added. Since
this function can be made static, make it so.

Silence the following warning triggered using W=1:

  kernel/trace/trace.c:2451:6: warning: no previous prototype for ‘ftrace_exports’ [-Wmissing-prototypes]

Link: http://lkml.kernel.org/r/20180516193012.25390-1-malat@debian.org

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 911470ad9e94..5afcfecb4bc2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2452,7 +2452,7 @@ static inline void ftrace_exports_disable(void)
 	static_branch_disable(&ftrace_exports_enabled);
 }
 
-void ftrace_exports(struct ring_buffer_event *event)
+static void ftrace_exports(struct ring_buffer_event *event)
 {
 	struct trace_export *export;
 
-- 
cgit 


From 754481e6954cbef53f8bc4412ad48dde611e21d3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 19 Dec 2018 22:38:21 -0500
Subject: tracing: Use str_has_prefix() helper for histogram code

The tracing histogram code contains a lot of instances of the construct:

 strncmp(str, "const", sizeof("const") - 1)

This can be prone to bugs due to typos or bad cut and paste. Use the
str_has_prefix() helper macro instead that removes the need for having two
copies of the constant string.

Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c5448c103770..9d590138f870 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1881,8 +1881,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
 	if (attrs->n_actions >= HIST_ACTIONS_MAX)
 		return ret;
 
-	if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) ||
-	    (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) {
+	if ((str_has_prefix(str, "onmatch(")) ||
+	    (str_has_prefix(str, "onmax("))) {
 		attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
 		if (!attrs->action_str[attrs->n_actions]) {
 			ret = -ENOMEM;
@@ -1899,34 +1899,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
 {
 	int ret = 0;
 
-	if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) ||
-	    (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) {
+	if ((str_has_prefix(str, "key=")) ||
+	    (str_has_prefix(str, "keys="))) {
 		attrs->keys_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->keys_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) ||
-		 (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) ||
-		 (strncmp(str, "values=", sizeof("values=") - 1) == 0)) {
+	} else if ((str_has_prefix(str, "val=")) ||
+		   (str_has_prefix(str, "vals=")) ||
+		   (str_has_prefix(str, "values="))) {
 		attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->vals_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) {
+	} else if (str_has_prefix(str, "sort=")) {
 		attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
 		if (!attrs->sort_key_str) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) {
+	} else if (str_has_prefix(str, "name=")) {
 		attrs->name = kstrdup(str, GFP_KERNEL);
 		if (!attrs->name) {
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) {
+	} else if (str_has_prefix(str, "clock=")) {
 		strsep(&str, "=");
 		if (!str) {
 			ret = -EINVAL;
@@ -1939,7 +1939,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
 			ret = -ENOMEM;
 			goto out;
 		}
-	} else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) {
+	} else if (str_has_prefix(str, "size=")) {
 		int map_bits = parse_map_size(str);
 
 		if (map_bits < 0) {
@@ -3558,7 +3558,7 @@ static struct action_data *onmax_parse(char *str)
 	if (!onmax_fn_name || !str)
 		goto free;
 
-	if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) {
+	if (str_has_prefix(onmax_fn_name, "save")) {
 		char *params = strsep(&str, ")");
 
 		if (!params) {
@@ -4346,7 +4346,7 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 	for (i = 0; i < hist_data->attrs->n_actions; i++) {
 		str = hist_data->attrs->action_str[i];
 
-		if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) {
+		if (str_has_prefix(str, "onmatch(")) {
 			char *action_str = str + sizeof("onmatch(") - 1;
 
 			data = onmatch_parse(tr, action_str);
@@ -4355,7 +4355,7 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 				break;
 			}
 			data->fn = action_trace;
-		} else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) {
+		} else if (str_has_prefix(str, "onmax(")) {
 			char *action_str = str + sizeof("onmax(") - 1;
 
 			data = onmax_parse(action_str);
-- 
cgit 


From b6b2735514bcd70ad1556a33892a636b20ece671 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 20 Dec 2018 13:20:07 -0500
Subject: tracing: Use str_has_prefix() instead of using fixed sizes

There are several instances of strncmp(str, "const", 123), where 123 is the
strlen of the const string to check if "const" is the prefix of str. But
this can be error prone. Use str_has_prefix() instead.

Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace_events.c      | 2 +-
 kernel/trace/trace_events_hist.c | 2 +-
 kernel/trace/trace_probe.c       | 4 ++--
 kernel/trace/trace_stack.c       | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5afcfecb4bc2..eac2824a18ab 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4411,7 +4411,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
 
 	cmp = strstrip(option);
 
-	if (strncmp(cmp, "no", 2) == 0) {
+	if (str_has_prefix(cmp, "no")) {
 		neg = 1;
 		cmp += 2;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bd0162c0467c..5b3b0c3c8a47 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1251,7 +1251,7 @@ static int f_show(struct seq_file *m, void *v)
 	 */
 	array_descriptor = strchr(field->type, '[');
 
-	if (!strncmp(field->type, "__data_loc", 10))
+	if (str_has_prefix(field->type, "__data_loc"))
 		array_descriptor = NULL;
 
 	if (!array_descriptor)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 9d590138f870..0d878dcd1e4b 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -518,7 +518,7 @@ static int synth_event_define_fields(struct trace_event_call *call)
 
 static bool synth_field_signed(char *type)
 {
-	if (strncmp(type, "u", 1) == 0)
+	if (str_has_prefix(type, "u"))
 		return false;
 
 	return true;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ff86417c0149..541375737403 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -194,7 +194,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 			code->op = FETCH_OP_RETVAL;
 		else
 			ret = -EINVAL;
-	} else if (strncmp(arg, "stack", 5) == 0) {
+	} else if (str_has_prefix(arg, "stack")) {
 		if (arg[5] == '\0') {
 			code->op = FETCH_OP_STACKP;
 		} else if (isdigit(arg[5])) {
@@ -213,7 +213,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
 	} else if (((flags & TPARG_FL_MASK) ==
 		    (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
-		   strncmp(arg, "arg", 3) == 0) {
+		   str_has_prefix(arg, "arg")) {
 		if (!isdigit(arg[3]))
 			return -EINVAL;
 		ret = kstrtoul(arg + 3, 10, &param);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index e2a153fc1afc..3641f28c343f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -448,7 +448,7 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
 
 static __init int enable_stacktrace(char *str)
 {
-	if (strncmp(str, "_filter=", 8) == 0)
+	if (str_has_prefix(str, "_filter="))
 		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
 
 	stack_tracer_enabled = 1;
-- 
cgit 


From 036876fa56204ae0fa59045bd6bbb2691a060633 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 21 Dec 2018 18:40:46 -0500
Subject: tracing: Have the historgram use the result of str_has_prefix() for
 len of prefix

As str_has_prefix() returns the length on match, we can use that for the
updating of the string pointer instead of recalculating the prefix size.

Cc: Tom Zanussi  <zanussi@kernel.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0d878dcd1e4b..449d90cfa151 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4342,12 +4342,13 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 	unsigned int i;
 	int ret = 0;
 	char *str;
+	int len;
 
 	for (i = 0; i < hist_data->attrs->n_actions; i++) {
 		str = hist_data->attrs->action_str[i];
 
-		if (str_has_prefix(str, "onmatch(")) {
-			char *action_str = str + sizeof("onmatch(") - 1;
+		if ((len = str_has_prefix(str, "onmatch("))) {
+			char *action_str = str + len;
 
 			data = onmatch_parse(tr, action_str);
 			if (IS_ERR(data)) {
@@ -4355,8 +4356,8 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 				break;
 			}
 			data->fn = action_trace;
-		} else if (str_has_prefix(str, "onmax(")) {
-			char *action_str = str + sizeof("onmax(") - 1;
+		} else if ((len = str_has_prefix(str, "onmax("))) {
+			char *action_str = str + len;
 
 			data = onmax_parse(action_str);
 			if (IS_ERR(data)) {
-- 
cgit 


From 3d739c1f6156c70eb0548aa288dcfbac9e0bd162 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 21 Dec 2018 23:10:26 -0500
Subject: tracing: Use the return of str_has_prefix() to remove open coded
 numbers

There are several locations that compare constants to the beginning of
string variables to determine what commands should be done, then the
constant length is used to index into the string. This is error prone as the
hard coded numbers have to match the size of the constants. Instead, use the
len returned from str_has_prefix() and remove the open coded string length
sizes.

Cc: Joe Perches <joe@perches.com>
Acked-by: Masami  Hiramatsu <mhiramat@kernel.org> (for trace_probe part)
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c       |  8 +++++---
 kernel/trace/trace_probe.c | 17 +++++++++--------
 kernel/trace/trace_stack.c |  6 ++++--
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eac2824a18ab..18b86c3974e1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4408,13 +4408,15 @@ static int trace_set_options(struct trace_array *tr, char *option)
 	int neg = 0;
 	int ret;
 	size_t orig_len = strlen(option);
+	int len;
 
 	cmp = strstrip(option);
 
-	if (str_has_prefix(cmp, "no")) {
+	len = str_has_prefix(cmp, "no");
+	if (len)
 		neg = 1;
-		cmp += 2;
-	}
+
+	cmp += len;
 
 	mutex_lock(&trace_types_lock);
 
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 541375737403..9962cb5da8ac 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -186,19 +186,20 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
 static int parse_probe_vars(char *arg, const struct fetch_type *t,
 			    struct fetch_insn *code, unsigned int flags)
 {
-	int ret = 0;
 	unsigned long param;
+	int ret = 0;
+	int len;
 
 	if (strcmp(arg, "retval") == 0) {
 		if (flags & TPARG_FL_RETURN)
 			code->op = FETCH_OP_RETVAL;
 		else
 			ret = -EINVAL;
-	} else if (str_has_prefix(arg, "stack")) {
-		if (arg[5] == '\0') {
+	} else if ((len = str_has_prefix(arg, "stack"))) {
+		if (arg[len] == '\0') {
 			code->op = FETCH_OP_STACKP;
-		} else if (isdigit(arg[5])) {
-			ret = kstrtoul(arg + 5, 10, &param);
+		} else if (isdigit(arg[len])) {
+			ret = kstrtoul(arg + len, 10, &param);
 			if (ret || ((flags & TPARG_FL_KERNEL) &&
 				    param > PARAM_MAX_STACK))
 				ret = -EINVAL;
@@ -213,10 +214,10 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
 	} else if (((flags & TPARG_FL_MASK) ==
 		    (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
-		   str_has_prefix(arg, "arg")) {
-		if (!isdigit(arg[3]))
+		   (len = str_has_prefix(arg, "arg"))) {
+		if (!isdigit(arg[len]))
 			return -EINVAL;
-		ret = kstrtoul(arg + 3, 10, &param);
+		ret = kstrtoul(arg + len, 10, &param);
 		if (ret || !param || param > PARAM_MAX_STACK)
 			return -EINVAL;
 		code->op = FETCH_OP_ARG;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3641f28c343f..eec648a0d673 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -448,8 +448,10 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
 
 static __init int enable_stacktrace(char *str)
 {
-	if (str_has_prefix(str, "_filter="))
-		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+	int len;
+
+	if ((len = str_has_prefix(str, "_filter=")))
+		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
 
 	stack_tracer_enabled = 1;
 	last_stack_tracer_enabled = 1;
-- 
cgit 


From 6d101ba6be2a26a3e1f513b5e293f0fd2b79ec5c Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 25 Nov 2018 14:41:05 -0800
Subject: sched/fair: Fix warning on non-SMP build

Caused by making the variable static:

  kernel/sched/fair.c:119:21: warning: 'capacity_margin' defined but not used [-Wunused-variable]

Seems easiest to just move it up under the existing ifdef CONFIG_SMP
that's a few lines above.

Fixes: ed8885a14433a ('sched/fair: Make some variables static')
Signed-off-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/fair.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c1cfbf6ba0c..d1907506318a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu)
 {
 	return -cpu;
 }
+
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+static unsigned int capacity_margin			= 1280;
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
 
-/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
- *
- * (default: ~20%)
- */
-static unsigned int capacity_margin			= 1280;
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
-- 
cgit 


From e250d91d65750a0c0c62483ac4f9f357e7317617 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Thu, 13 Dec 2018 15:17:37 +0100
Subject: cgroup: fix parsing empty mount option string

This fixes the case where all mount options specified are consumed by an
LSM and all that's left is an empty string. In this case cgroupfs should
accept the string and not fail.

How to reproduce (with SELinux enabled):

    # umount /sys/fs/cgroup/unified
    # mount -o context=system_u:object_r:cgroup_t:s0 -t cgroup2 cgroup2 /sys/fs/cgroup/unified
    mount: /sys/fs/cgroup/unified: wrong fs type, bad option, bad superblock on cgroup2, missing codepage or helper program, or other error.
    # dmesg | tail -n 1
    [   31.575952] cgroup: cgroup2: unknown option ""

Fixes: 67e9c74b8a87 ("cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type")
[NOTE: should apply on top of commit 5136f6365ce3 ("cgroup: implement "nsdelegate" mount option"), older versions need manual rebase]
Suggested-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8da9bf5e422f..879c9f191f66 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1748,7 +1748,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
 
 	*root_flags = 0;
 
-	if (!data)
+	if (!data || *data == '\0')
 		return 0;
 
 	while ((token = strsep(&data, ",")) != NULL) {
-- 
cgit 


From 3fc9c12d27b4ded4f1f761a800558dab2e6bbac5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Dec 2018 10:31:07 -0800
Subject: cgroup: Add named hierarchy disabling to cgroup_no_v1 boot param

It can be useful to inhibit all cgroup1 hierarchies especially during
transition and for debugging.  cgroup_no_v1 can block hierarchies with
controllers which leaves out the named hierarchies.  Expand it to
cover the named hierarchies so that "cgroup_no_v1=all,named" disables
all cgroup1 hierarchies.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Marcin Pawlowski <mpawlowski@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 ++++++--
 kernel/cgroup/cgroup-v1.c                       | 14 +++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 81d1d5a74728..8b7652449228 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -486,10 +486,14 @@
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
 
-	cgroup_no_v1=	[KNL] Disable one, multiple, all cgroup controllers in v1
-			Format: { controller[,controller...] | "all" }
+	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
+			Format: { { controller | "all" | "named" }
+			          [,{ controller | "all" | "named" }...] }
 			Like cgroup_disable, but only applies to cgroup v1;
 			the blacklisted controllers remain available in cgroup2.
+			"all" blacklists all controllers and "named" disables
+			named mounts. Specifying both "all" and "named" disables
+			all v1 hierarchies.
 
 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 51063e7a93c2..583b969b0c0e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -27,6 +27,9 @@
 /* Controllers blocked by the commandline in v1 */
 static u16 cgroup_no_v1_mask;
 
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
 /*
  * pidlist destructions need to be flushed on cgroup destruction.  Use a
  * separate workqueue as flush domain.
@@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		}
 		if (!strncmp(token, "name=", 5)) {
 			const char *name = token + 5;
+
+			/* blocked by boot param? */
+			if (cgroup_no_v1_named)
+				return -ENOENT;
 			/* Can't specify an empty name */
 			if (!strlen(name))
 				return -EINVAL;
@@ -1292,7 +1299,12 @@ static int __init cgroup_no_v1(char *str)
 
 		if (!strcmp(token, "all")) {
 			cgroup_no_v1_mask = U16_MAX;
-			break;
+			continue;
+		}
+
+		if (!strcmp(token, "named")) {
+			cgroup_no_v1_named = true;
+			continue;
 		}
 
 		for_each_subsys(ss, i) {
-- 
cgit 


From 3d6357de8aa09e1966770dc1171c72679946464f Mon Sep 17 00:00:00 2001
From: Arun KS <arunks@codeaurora.org>
Date: Fri, 28 Dec 2018 00:34:20 -0800
Subject: mm: reference totalram_pages and managed_pages once per function

Patch series "mm: convert totalram_pages, totalhigh_pages and managed
pages to atomic", v5.

This series converts totalram_pages, totalhigh_pages and
zone->managed_pages to atomic variables.

totalram_pages, zone->managed_pages and totalhigh_pages updates are
protected by managed_page_count_lock, but readers never care about it.
Convert these variables to atomic to avoid readers potentially seeing a
store tear.

Main motivation was that managed_page_count_lock handling was complicating
things.  It was discussed in length here,
https://lore.kernel.org/patchwork/patch/995739/#1181785 It seemes better
to remove the lock and convert variables to atomic.  With the change,
preventing poteintial store-to-read tearing comes as a bonus.

This patch (of 4):

This is in preparation to a later patch which converts totalram_pages and
zone->managed_pages to atomic variables.  Please note that re-reading the
value might lead to a different value and as such it could lead to
unexpected behavior.  There are no known bugs as a result of the current
code but it is better to prevent from them in principle.

Link: http://lkml.kernel.org/r/1542090790-21750-2-git-send-email-arunks@codeaurora.org
Signed-off-by: Arun KS <arunks@codeaurora.org>
Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/mem.c                 |  2 +-
 arch/x86/kernel/cpu/microcode/core.c |  5 +++--
 drivers/hv/hv_balloon.c              | 19 ++++++++++---------
 fs/file_table.c                      |  7 ++++---
 kernel/fork.c                        |  5 +++--
 kernel/kexec_core.c                  |  5 +++--
 mm/page_alloc.c                      |  5 +++--
 mm/shmem.c                           |  3 ++-
 net/dccp/proto.c                     |  7 ++++---
 net/netfilter/nf_conntrack_core.c    |  7 ++++---
 net/netfilter/xt_hashlimit.c         |  5 +++--
 net/sctp/protocol.c                  |  7 ++++---
 12 files changed, 44 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 1067469ba2ea..2da209687a22 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -52,7 +52,7 @@ void __init mem_init(void)
 	/* this will put all low memory onto the freelists */
 	memblock_free_all();
 	max_low_pfn = totalram_pages;
-	max_pfn = totalram_pages;
+	max_pfn = max_low_pfn;
 	mem_init_print_info(NULL);
 	kmalloc_ok = 1;
 }
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 2637ff09d6a0..168fa272cc3e 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -434,9 +434,10 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 			       size_t len, loff_t *ppos)
 {
 	ssize_t ret = -EINVAL;
+	unsigned long nr_pages = totalram_pages;
 
-	if ((len >> PAGE_SHIFT) > totalram_pages) {
-		pr_err("too much data (max %ld pages)\n", totalram_pages);
+	if ((len >> PAGE_SHIFT) > nr_pages) {
+		pr_err("too much data (max %ld pages)\n", nr_pages);
 		return ret;
 	}
 
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 41631512ae97..f3e7da981610 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1090,6 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
 static unsigned long compute_balloon_floor(void)
 {
 	unsigned long min_pages;
+	unsigned long nr_pages = totalram_pages;
 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
 	/* Simple continuous piecewiese linear function:
 	 *  max MiB -> min MiB  gradient
@@ -1102,16 +1103,16 @@ static unsigned long compute_balloon_floor(void)
 	 *    8192       744    (1/16)
 	 *   32768      1512	(1/32)
 	 */
-	if (totalram_pages < MB2PAGES(128))
-		min_pages = MB2PAGES(8) + (totalram_pages >> 1);
-	else if (totalram_pages < MB2PAGES(512))
-		min_pages = MB2PAGES(40) + (totalram_pages >> 2);
-	else if (totalram_pages < MB2PAGES(2048))
-		min_pages = MB2PAGES(104) + (totalram_pages >> 3);
-	else if (totalram_pages < MB2PAGES(8192))
-		min_pages = MB2PAGES(232) + (totalram_pages >> 4);
+	if (nr_pages < MB2PAGES(128))
+		min_pages = MB2PAGES(8) + (nr_pages >> 1);
+	else if (nr_pages < MB2PAGES(512))
+		min_pages = MB2PAGES(40) + (nr_pages >> 2);
+	else if (nr_pages < MB2PAGES(2048))
+		min_pages = MB2PAGES(104) + (nr_pages >> 3);
+	else if (nr_pages < MB2PAGES(8192))
+		min_pages = MB2PAGES(232) + (nr_pages >> 4);
 	else
-		min_pages = MB2PAGES(488) + (totalram_pages >> 5);
+		min_pages = MB2PAGES(488) + (nr_pages >> 5);
 #undef MB2PAGES
 	return min_pages;
 }
diff --git a/fs/file_table.c b/fs/file_table.c
index e49af4caf15d..b6e9587f05c7 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -380,10 +380,11 @@ void __init files_init(void)
 void __init files_maxfiles_init(void)
 {
 	unsigned long n;
-	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+	unsigned long nr_pages = totalram_pages;
+	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
 
-	memreserve = min(memreserve, totalram_pages - 1);
-	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
+	memreserve = min(memreserve, nr_pages - 1);
+	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index e2a5156bc9c3..8617a326e9f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -744,15 +744,16 @@ void __init __weak arch_task_cache_init(void) { }
 static void set_max_threads(unsigned int max_threads_suggested)
 {
 	u64 threads;
+	unsigned long nr_pages = totalram_pages;
 
 	/*
 	 * The number of threads shall be limited such that the thread
 	 * structures may only consume a small part of the available memory.
 	 */
-	if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+	if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
 		threads = MAX_THREADS;
 	else
-		threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+		threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
 				    (u64) THREAD_SIZE * 8UL);
 
 	if (threads > max_threads_suggested)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 86ef06d3dbe3..7e967ca98d92 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -152,6 +152,7 @@ int sanity_check_segment_list(struct kimage *image)
 	int i;
 	unsigned long nr_segments = image->nr_segments;
 	unsigned long total_pages = 0;
+	unsigned long nr_pages = totalram_pages;
 
 	/*
 	 * Verify we have good destination addresses.  The caller is
@@ -217,13 +218,13 @@ int sanity_check_segment_list(struct kimage *image)
 	 * wasted allocating pages, which can cause a soft lockup.
 	 */
 	for (i = 0; i < nr_segments; i++) {
-		if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
 			return -EINVAL;
 
 		total_pages += PAGE_COUNT(image->segment[i].memsz);
 	}
 
-	if (total_pages > totalram_pages / 2)
+	if (total_pages > nr_pages / 2)
 		return -EINVAL;
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c82c41edfe22..b79e79caea99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7257,6 +7257,7 @@ static void calculate_totalreserve_pages(void)
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			long max = 0;
+			unsigned long managed_pages = zone->managed_pages;
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7268,8 @@ static void calculate_totalreserve_pages(void)
 			/* we treat the high watermark as reserved pages. */
 			max += high_wmark_pages(zone);
 
-			if (max > zone->managed_pages)
-				max = zone->managed_pages;
+			if (max > managed_pages)
+				max = managed_pages;
 
 			pgdat->totalreserve_pages += max;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 375f3ac19bb8..b1f0f54470fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -114,7 +114,8 @@ static unsigned long shmem_default_max_blocks(void)
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	unsigned long nr_pages = totalram_pages;
+	return min(nr_pages - totalhigh_pages, nr_pages / 2);
 }
 #endif
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2cc5fbb1b29e..ff727ff61b5b 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1131,6 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug);
 static int __init dccp_init(void)
 {
 	unsigned long goal;
+	unsigned long nr_pages = totalram_pages;
 	int ehash_order, bhash_order, i;
 	int rc;
 
@@ -1157,10 +1158,10 @@ static int __init dccp_init(void)
 	 *
 	 * The methodology is similar to that of the buffer cache.
 	 */
-	if (totalram_pages >= (128 * 1024))
-		goal = totalram_pages >> (21 - PAGE_SHIFT);
+	if (nr_pages >= (128 * 1024))
+		goal = nr_pages >> (21 - PAGE_SHIFT);
 	else
-		goal = totalram_pages >> (23 - PAGE_SHIFT);
+		goal = nr_pages >> (23 - PAGE_SHIFT);
 
 	if (thash_entries)
 		goal = (thash_entries *
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e87c21e47efe..5eb990830348 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2248,6 +2248,7 @@ static __always_inline unsigned int total_extension_size(void)
 
 int nf_conntrack_init_start(void)
 {
+	unsigned long nr_pages = totalram_pages;
 	int max_factor = 8;
 	int ret = -ENOMEM;
 	int i;
@@ -2267,11 +2268,11 @@ int nf_conntrack_init_start(void)
 		 * >= 4GB machines have 65536 buckets.
 		 */
 		nf_conntrack_htable_size
-			= (((totalram_pages << PAGE_SHIFT) / 16384)
+			= (((nr_pages << PAGE_SHIFT) / 16384)
 			   / sizeof(struct hlist_head));
-		if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+		if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
 			nf_conntrack_htable_size = 65536;
-		else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
+		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
 			nf_conntrack_htable_size = 16384;
 		if (nf_conntrack_htable_size < 32)
 			nf_conntrack_htable_size = 32;
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 28e27a32d9b9..88b520ba2abc 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -274,14 +274,15 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 	struct xt_hashlimit_htable *hinfo;
 	const struct seq_operations *ops;
 	unsigned int size, i;
+	unsigned long nr_pages = totalram_pages;
 	int ret;
 
 	if (cfg->size) {
 		size = cfg->size;
 	} else {
-		size = (totalram_pages << PAGE_SHIFT) / 16384 /
+		size = (nr_pages << PAGE_SHIFT) / 16384 /
 		       sizeof(struct hlist_head);
-		if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
+		if (nr_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
 			size = 8192;
 		if (size < 16)
 			size = 16;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9b277bd36d1a..a5b24182b3cc 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1368,6 +1368,7 @@ static __init int sctp_init(void)
 	int status = -EINVAL;
 	unsigned long goal;
 	unsigned long limit;
+	unsigned long nr_pages = totalram_pages;
 	int max_share;
 	int order;
 	int num_entries;
@@ -1426,10 +1427,10 @@ static __init int sctp_init(void)
 	 * The methodology is similar to that of the tcp hash tables.
 	 * Though not identical.  Start by getting a goal size
 	 */
-	if (totalram_pages >= (128 * 1024))
-		goal = totalram_pages >> (22 - PAGE_SHIFT);
+	if (nr_pages >= (128 * 1024))
+		goal = nr_pages >> (22 - PAGE_SHIFT);
 	else
-		goal = totalram_pages >> (24 - PAGE_SHIFT);
+		goal = nr_pages >> (24 - PAGE_SHIFT);
 
 	/* Then compute the page order for said goal */
 	order = get_order(goal);
-- 
cgit 


From ca79b0c211af63fa3276f0e3fd7dd9ada2439839 Mon Sep 17 00:00:00 2001
From: Arun KS <arunks@codeaurora.org>
Date: Fri, 28 Dec 2018 00:34:29 -0800
Subject: mm: convert totalram_pages and totalhigh_pages variables to atomic

totalram_pages and totalhigh_pages are made static inline function.

Main motivation was that managed_page_count_lock handling was complicating
things.  It was discussed in length here,
https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seemes
better to remove the lock and convert variables to atomic, with preventing
poteintial store-to-read tearing as a bonus.

[akpm@linux-foundation.org: coding style fixes]
Link: http://lkml.kernel.org/r/1542090790-21750-4-git-send-email-arunks@codeaurora.org
Signed-off-by: Arun KS <arunks@codeaurora.org>
Suggested-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/csky/mm/init.c                           |  4 ++--
 arch/powerpc/platforms/pseries/cmm.c          | 10 +++++-----
 arch/s390/mm/init.c                           |  2 +-
 arch/um/kernel/mem.c                          |  2 +-
 arch/x86/kernel/cpu/microcode/core.c          |  2 +-
 drivers/char/agp/backend.c                    |  4 ++--
 drivers/gpu/drm/i915/i915_gem.c               |  2 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  4 ++--
 drivers/hv/hv_balloon.c                       |  2 +-
 drivers/md/dm-bufio.c                         |  2 +-
 drivers/md/dm-crypt.c                         |  2 +-
 drivers/md/dm-integrity.c                     |  2 +-
 drivers/md/dm-stats.c                         |  2 +-
 drivers/media/platform/mtk-vpu/mtk_vpu.c      |  2 +-
 drivers/misc/vmw_balloon.c                    |  2 +-
 drivers/parisc/ccio-dma.c                     |  4 ++--
 drivers/parisc/sba_iommu.c                    |  4 ++--
 drivers/staging/android/ion/ion_system_heap.c |  2 +-
 drivers/xen/xen-selfballoon.c                 |  6 +++---
 fs/ceph/super.h                               |  2 +-
 fs/file_table.c                               |  2 +-
 fs/fuse/inode.c                               |  2 +-
 fs/nfs/write.c                                |  2 +-
 fs/nfsd/nfscache.c                            |  2 +-
 fs/ntfs/malloc.h                              |  2 +-
 fs/proc/base.c                                |  2 +-
 include/linux/highmem.h                       | 28 +++++++++++++++++++++++++--
 include/linux/mm.h                            | 27 +++++++++++++++++++++++++-
 include/linux/swap.h                          |  1 -
 kernel/fork.c                                 |  2 +-
 kernel/kexec_core.c                           |  2 +-
 kernel/power/snapshot.c                       |  2 +-
 mm/highmem.c                                  |  5 ++---
 mm/huge_memory.c                              |  2 +-
 mm/kasan/quarantine.c                         |  2 +-
 mm/memblock.c                                 |  4 ++--
 mm/mm_init.c                                  |  2 +-
 mm/oom_kill.c                                 |  2 +-
 mm/page_alloc.c                               | 20 ++++++++++---------
 mm/shmem.c                                    |  9 +++++----
 mm/slab.c                                     |  2 +-
 mm/swap.c                                     |  2 +-
 mm/util.c                                     |  2 +-
 mm/vmalloc.c                                  |  4 ++--
 mm/workingset.c                               |  2 +-
 mm/zswap.c                                    |  4 ++--
 net/dccp/proto.c                              |  2 +-
 net/decnet/dn_route.c                         |  2 +-
 net/ipv4/tcp_metrics.c                        |  2 +-
 net/netfilter/nf_conntrack_core.c             |  2 +-
 net/netfilter/xt_hashlimit.c                  |  2 +-
 net/sctp/protocol.c                           |  2 +-
 security/integrity/ima/ima_kexec.c            |  2 +-
 53 files changed, 131 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index dc07c078f9b8..66e597053488 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -71,7 +71,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		ClearPageReserved(virt_to_page(start));
 		init_page_count(virt_to_page(start));
 		free_page(start);
-		totalram_pages++;
+		totalram_pages_inc();
 	}
 }
 #endif
@@ -88,7 +88,7 @@ void free_initmem(void)
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		free_page(addr);
-		totalram_pages++;
+		totalram_pages_inc();
 		addr += PAGE_SIZE;
 	}
 
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 25427a48feae..e8d63a6a9002 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -208,7 +208,7 @@ static long cmm_alloc_pages(long nr)
 
 		pa->page[pa->index++] = addr;
 		loaned_pages++;
-		totalram_pages--;
+		totalram_pages_dec();
 		spin_unlock(&cmm_lock);
 		nr--;
 	}
@@ -247,7 +247,7 @@ static long cmm_free_pages(long nr)
 		free_page(addr);
 		loaned_pages--;
 		nr--;
-		totalram_pages++;
+		totalram_pages_inc();
 	}
 	spin_unlock(&cmm_lock);
 	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
@@ -291,7 +291,7 @@ static void cmm_get_mpp(void)
 	int rc;
 	struct hvcall_mpp_data mpp_data;
 	signed long active_pages_target, page_loan_request, target;
-	signed long total_pages = totalram_pages + loaned_pages;
+	signed long total_pages = totalram_pages() + loaned_pages;
 	signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
 
 	rc = h_get_mpp(&mpp_data);
@@ -322,7 +322,7 @@ static void cmm_get_mpp(void)
 
 	cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
 		page_loan_request, loaned_pages, loaned_pages_target,
-		oom_freed_pages, totalram_pages);
+		oom_freed_pages, totalram_pages());
 }
 
 static struct notifier_block cmm_oom_nb = {
@@ -581,7 +581,7 @@ static int cmm_mem_going_offline(void *arg)
 			free_page(pa_curr->page[idx]);
 			freed++;
 			loaned_pages--;
-			totalram_pages++;
+			totalram_pages_inc();
 			pa_curr->page[idx] = pa_last->page[--pa_last->index];
 			if (pa_last->index == 0) {
 				if (pa_curr == pa_last)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 76d0708438e9..50388190b393 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -59,7 +59,7 @@ static void __init setup_zero_pages(void)
 	order = 7;
 
 	/* Limit number of empty zero pages for small memory sizes */
-	while (order > 2 && (totalram_pages >> 10) < (1UL << order))
+	while (order > 2 && (totalram_pages() >> 10) < (1UL << order))
 		order--;
 
 	empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 2da209687a22..8d21a83dd289 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -51,7 +51,7 @@ void __init mem_init(void)
 
 	/* this will put all low memory onto the freelists */
 	memblock_free_all();
-	max_low_pfn = totalram_pages;
+	max_low_pfn = totalram_pages();
 	max_pfn = max_low_pfn;
 	mem_init_print_info(NULL);
 	kmalloc_ok = 1;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 168fa272cc3e..97f9ada9ceda 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -434,7 +434,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 			       size_t len, loff_t *ppos)
 {
 	ssize_t ret = -EINVAL;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 
 	if ((len >> PAGE_SHIFT) > nr_pages) {
 		pr_err("too much data (max %ld pages)\n", nr_pages);
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 38ffb281df97..004a3ce8ba72 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -115,9 +115,9 @@ static int agp_find_max(void)
 	long memory, index, result;
 
 #if PAGE_SHIFT < 20
-	memory = totalram_pages >> (20 - PAGE_SHIFT);
+	memory = totalram_pages() >> (20 - PAGE_SHIFT);
 #else
-	memory = totalram_pages << (PAGE_SHIFT - 20);
+	memory = totalram_pages() << (PAGE_SHIFT - 20);
 #endif
 	index = 1;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d36a9755ad91..a9de07bb72c8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2559,7 +2559,7 @@ static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 	 * If there's no chance of allocating enough pages for the whole
 	 * object, bail early.
 	 */
-	if (page_count > totalram_pages)
+	if (page_count > totalram_pages())
 		return -ENOMEM;
 
 	st = kmalloc(sizeof(*st), GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 69fe86b30fbb..a9ed0ecc94e2 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -170,7 +170,7 @@ static int igt_ppgtt_alloc(void *arg)
 	 * This should ensure that we do not run into the oomkiller during
 	 * the test and take down the machine wilfully.
 	 */
-	limit = totalram_pages << PAGE_SHIFT;
+	limit = totalram_pages() << PAGE_SHIFT;
 	limit = min(ppgtt->vm.total, limit);
 
 	/* Check we can allocate the entire range */
@@ -1244,7 +1244,7 @@ static int exercise_mock(struct drm_i915_private *i915,
 				     u64 hole_start, u64 hole_end,
 				     unsigned long end_time))
 {
-	const u64 limit = totalram_pages << PAGE_SHIFT;
+	const u64 limit = totalram_pages() << PAGE_SHIFT;
 	struct i915_gem_context *ctx;
 	struct i915_hw_ppgtt *ppgtt;
 	IGT_TIMEOUT(end_time);
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index f3e7da981610..5301fef16c31 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1090,7 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
 static unsigned long compute_balloon_floor(void)
 {
 	unsigned long min_pages;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
 	/* Simple continuous piecewiese linear function:
 	 *  max MiB -> min MiB  gradient
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index dc385b70e4c3..8b0b628e5d1c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1887,7 +1887,7 @@ static int __init dm_bufio_init(void)
 	dm_bufio_allocated_vmalloc = 0;
 	dm_bufio_current_allocated = 0;
 
-	mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
+	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
 			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
 
 	if (mem > ULONG_MAX)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index a7195eb5b8d8..a8c32de29e3f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2158,7 +2158,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
 
 static void crypt_calculate_pages_per_client(void)
 {
-	unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100;
+	unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100;
 
 	if (!dm_crypt_clients_n)
 		return;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index d4ad0bfee251..62baa3214cc7 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2843,7 +2843,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
 	journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
 				PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
 	journal_desc_size = journal_pages * sizeof(struct page_list);
-	if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
+	if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) {
 		*error = "Journal doesn't fit into memory";
 		r = -ENOMEM;
 		goto bad;
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 21de30b4e2a1..45b92a3d9d8e 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -85,7 +85,7 @@ static bool __check_shared_memory(size_t alloc_size)
 	a = shared_memory_amount + alloc_size;
 	if (a < shared_memory_amount)
 		return false;
-	if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
+	if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
 		return false;
 #ifdef CONFIG_MMU
 	if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c
index 616f78b24a79..b6602490a247 100644
--- a/drivers/media/platform/mtk-vpu/mtk_vpu.c
+++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c
@@ -855,7 +855,7 @@ static int mtk_vpu_probe(struct platform_device *pdev)
 	/* Set PTCM to 96K and DTCM to 32K */
 	vpu_cfg_writel(vpu, 0x2, VPU_TCM_CFG);
 
-	vpu->enable_4GB = !!(totalram_pages > (SZ_2G >> PAGE_SHIFT));
+	vpu->enable_4GB = !!(totalram_pages() > (SZ_2G >> PAGE_SHIFT));
 	dev_info(dev, "4GB mode %u\n", vpu->enable_4GB);
 
 	if (vpu->enable_4GB) {
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 9b0b3fa4f836..e6126a4b95d3 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -570,7 +570,7 @@ static int vmballoon_send_get_target(struct vmballoon *b)
 	unsigned long status;
 	unsigned long limit;
 
-	limit = totalram_pages;
+	limit = totalram_pages();
 
 	/* Ensure limit fits in 32-bits */
 	if (limit != (u32)limit)
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 701a7d6a74d5..358e380eb7fa 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1251,7 +1251,7 @@ ccio_ioc_init(struct ioc *ioc)
 	** Hot-Plug/Removal of PCI cards. (aka PCI OLARD).
 	*/
 
-	iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver));
+	iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver));
 
 	/* limit IOVA space size to 1MB-1GB */
 
@@ -1290,7 +1290,7 @@ ccio_ioc_init(struct ioc *ioc)
 
 	DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n",
 			__func__, ioc->ioc_regs,
-			(unsigned long) totalram_pages >> (20 - PAGE_SHIFT),
+			(unsigned long) totalram_pages() >> (20 - PAGE_SHIFT),
 			iova_space_size>>20,
 			iov_order + PAGE_SHIFT);
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index c1e599a429af..e0655949480a 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1414,7 +1414,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
 	** for DMA hints - ergo only 30 bits max.
 	*/
 
-	iova_space_size = (u32) (totalram_pages/global_ioc_cnt);
+	iova_space_size = (u32) (totalram_pages()/global_ioc_cnt);
 
 	/* limit IOVA space size to 1MB-1GB */
 	if (iova_space_size < (1 << (20 - PAGE_SHIFT))) {
@@ -1439,7 +1439,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
 	DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n",
 			__func__,
 			ioc->ioc_hpa,
-			(unsigned long) totalram_pages >> (20 - PAGE_SHIFT),
+			(unsigned long) totalram_pages() >> (20 - PAGE_SHIFT),
 			iova_space_size>>20,
 			iov_order + PAGE_SHIFT);
 
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index 548bb02c0ca6..6cb0eebdff89 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -110,7 +110,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap,
 	unsigned long size_remaining = PAGE_ALIGN(size);
 	unsigned int max_order = orders[0];
 
-	if (size / PAGE_SIZE > totalram_pages / 2)
+	if (size / PAGE_SIZE > totalram_pages() / 2)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&pages);
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
index 5165aa82bf7d..246f6122c9ee 100644
--- a/drivers/xen/xen-selfballoon.c
+++ b/drivers/xen/xen-selfballoon.c
@@ -189,7 +189,7 @@ static void selfballoon_process(struct work_struct *work)
 	bool reset_timer = false;
 
 	if (xen_selfballooning_enabled) {
-		cur_pages = totalram_pages;
+		cur_pages = totalram_pages();
 		tgt_pages = cur_pages; /* default is no change */
 		goal_pages = vm_memory_committed() +
 				totalreserve_pages +
@@ -227,7 +227,7 @@ static void selfballoon_process(struct work_struct *work)
 		if (tgt_pages < floor_pages)
 			tgt_pages = floor_pages;
 		balloon_set_new_target(tgt_pages +
-			balloon_stats.current_pages - totalram_pages);
+			balloon_stats.current_pages - totalram_pages());
 		reset_timer = true;
 	}
 #ifdef CONFIG_FRONTSWAP
@@ -569,7 +569,7 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)
 	 * much more reliably and response faster in some cases.
 	 */
 	if (!selfballoon_reserved_mb) {
-		reserve_pages = totalram_pages / 10;
+		reserve_pages = totalram_pages() / 10;
 		selfballoon_reserved_mb = PAGES2MB(reserve_pages);
 	}
 	schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 79a265ba9200..dfb64a5211b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -810,7 +810,7 @@ static inline int default_congestion_kb(void)
 	 * This allows larger machines to have larger/more transfers.
 	 * Limit the default to 256M
 	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
 	if (congestion_kb > 256*1024)
 		congestion_kb = 256*1024;
 
diff --git a/fs/file_table.c b/fs/file_table.c
index b6e9587f05c7..5679e7fcb6b0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -380,7 +380,7 @@ void __init files_init(void)
 void __init files_maxfiles_init(void)
 {
 	unsigned long n;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
 
 	memreserve = min(memreserve, nr_pages - 1);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 568abed20eb2..76baaa6be393 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -824,7 +824,7 @@ static const struct super_operations fuse_super_operations = {
 static void sanitize_global_limit(unsigned *limit)
 {
 	if (*limit == 0)
-		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
+		*limit = ((totalram_pages() << PAGE_SHIFT) >> 13) /
 			 sizeof(struct fuse_req);
 
 	if (*limit >= 1 << 16)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 586726a590d8..4f15665f0ad1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2121,7 +2121,7 @@ int __init nfs_init_writepagecache(void)
 	 * This allows larger machines to have larger/more transfers.
 	 * Limit the default to 256M
 	 */
-	nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
 	if (nfs_congestion_kb > 256*1024)
 		nfs_congestion_kb = 256*1024;
 
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index e2fe0e9ce0df..da52b594362a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -99,7 +99,7 @@ static unsigned int
 nfsd_cache_size_limit(void)
 {
 	unsigned int limit;
-	unsigned long low_pages = totalram_pages - totalhigh_pages;
+	unsigned long low_pages = totalram_pages() - totalhigh_pages();
 
 	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
 	return min_t(unsigned int, limit, 256*1024);
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index ab172e5f51d9..5becc8acc8f4 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
 		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
 		/* return (void *)__get_free_page(gfp_mask); */
 	}
-	if (likely((size >> PAGE_SHIFT) < totalram_pages))
+	if (likely((size >> PAGE_SHIFT) < totalram_pages()))
 		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce3465479447..d7fd1ca807d2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -530,7 +530,7 @@ static const struct file_operations proc_lstats_operations = {
 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
-	unsigned long totalpages = totalram_pages + total_swap_pages;
+	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
 	points = oom_badness(task, NULL, NULL, totalpages) *
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0690679832d4..ea5cdbd8c2c3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -36,7 +36,31 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
 
 /* declarations for linux/mm/highmem.c */
 unsigned int nr_free_highpages(void);
-extern unsigned long totalhigh_pages;
+extern atomic_long_t _totalhigh_pages;
+static inline unsigned long totalhigh_pages(void)
+{
+	return (unsigned long)atomic_long_read(&_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_inc(void)
+{
+	atomic_long_inc(&_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_dec(void)
+{
+	atomic_long_dec(&_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_add(long count)
+{
+	atomic_long_add(count, &_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_set(long val)
+{
+	atomic_long_set(&_totalhigh_pages, val);
+}
 
 void kmap_flush_unused(void);
 
@@ -51,7 +75,7 @@ static inline struct page *kmap_to_page(void *addr)
 	return virt_to_page(addr);
 }
 
-#define totalhigh_pages 0UL
+static inline unsigned long totalhigh_pages(void) { return 0UL; }
 
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b4d01969e700..1d2be4c2d34a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -48,7 +48,32 @@ static inline void set_max_mapnr(unsigned long limit)
 static inline void set_max_mapnr(unsigned long limit) { }
 #endif
 
-extern unsigned long totalram_pages;
+extern atomic_long_t _totalram_pages;
+static inline unsigned long totalram_pages(void)
+{
+	return (unsigned long)atomic_long_read(&_totalram_pages);
+}
+
+static inline void totalram_pages_inc(void)
+{
+	atomic_long_inc(&_totalram_pages);
+}
+
+static inline void totalram_pages_dec(void)
+{
+	atomic_long_dec(&_totalram_pages);
+}
+
+static inline void totalram_pages_add(long count)
+{
+	atomic_long_add(count, &_totalram_pages);
+}
+
+static inline void totalram_pages_set(long val)
+{
+	atomic_long_set(&_totalram_pages, val);
+}
+
 extern void * high_memory;
 extern int page_cluster;
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a8f6d5d89524..77459d695010 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,7 +310,6 @@ void workingset_update_node(struct xa_node *node);
 } while (0)
 
 /* linux/mm/page_alloc.c */
-extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
 extern unsigned long nr_free_buffer_pages(void);
 extern unsigned long nr_free_pagecache_pages(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8617a326e9f5..c979605fe806 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -744,7 +744,7 @@ void __init __weak arch_task_cache_init(void) { }
 static void set_max_threads(unsigned int max_threads_suggested)
 {
 	u64 threads;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 
 	/*
 	 * The number of threads shall be limited such that the thread
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 7e967ca98d92..d7140447be75 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -152,7 +152,7 @@ int sanity_check_segment_list(struct kimage *image)
 	int i;
 	unsigned long nr_segments = image->nr_segments;
 	unsigned long total_pages = 0;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 
 	/*
 	 * Verify we have good destination addresses.  The caller is
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b0308a2c6000..640b2034edd6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -105,7 +105,7 @@ unsigned long image_size;
 
 void __init hibernate_image_size_init(void)
 {
-	image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+	image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE;
 }
 
 /*
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..107b10f9878e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
 }
 #endif
 
-unsigned long totalhigh_pages __read_mostly;
-EXPORT_SYMBOL(totalhigh_pages);
-
+atomic_long_t _totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(_totalhigh_pages);
 
 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e84a10b0d310..da6682bb69aa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -420,7 +420,7 @@ static int __init hugepage_init(void)
 	 * where the extra memory used could hurt more than TLB overhead
 	 * is likely to save.  The admin can still enable it through /sys.
 	 */
-	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
 		transparent_hugepage_flags = 0;
 		return 0;
 	}
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 57334ef2d7ef..978bc4a3eb51 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -237,7 +237,7 @@ void quarantine_reduce(void)
 	 * Update quarantine size in case of hotplug. Allocate a fraction of
 	 * the installed memory to quarantine minus per-cpu queue limits.
 	 */
-	total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+	total_size = (totalram_pages() << PAGE_SHIFT) /
 		QUARANTINE_FRACTION;
 	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
 	new_quarantine_size = (total_size < percpu_quarantines) ?
diff --git a/mm/memblock.c b/mm/memblock.c
index 0068f87af1e8..a53d8697612c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1576,7 +1576,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
 
 	for (; cursor < end; cursor++) {
 		memblock_free_pages(pfn_to_page(cursor), cursor, 0);
-		totalram_pages++;
+		totalram_pages_inc();
 	}
 }
 
@@ -1978,7 +1978,7 @@ unsigned long __init memblock_free_all(void)
 	reset_all_zones_managed_pages();
 
 	pages = free_low_memory_core_early();
-	totalram_pages += pages;
+	totalram_pages_add(pages);
 
 	return pages;
 }
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a530789b..33917105a3a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void)
 	s32 batch = max_t(s32, nr*2, 32);
 
 	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
-	memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+	memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
 
 	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6589f60d5018..21d487749e1d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	}
 
 	/* Default to all available memory */
-	oc->totalpages = totalram_pages + total_swap_pages;
+	oc->totalpages = totalram_pages() + total_swap_pages;
 
 	if (!IS_ENABLED(CONFIG_NUMA))
 		return CONSTRAINT_NONE;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4b5c4ff68f18..eb2027892ef9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
@@ -124,7 +125,8 @@ EXPORT_SYMBOL(node_states);
 /* Protect totalram_pages and zone->managed_pages */
 static DEFINE_SPINLOCK(managed_page_count_lock);
 
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
@@ -4747,11 +4749,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
 
 void si_meminfo(struct sysinfo *val)
 {
-	val->totalram = totalram_pages;
+	val->totalram = totalram_pages();
 	val->sharedram = global_node_page_state(NR_SHMEM);
 	val->freeram = global_zone_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
-	val->totalhigh = totalhigh_pages;
+	val->totalhigh = totalhigh_pages();
 	val->freehigh = nr_free_highpages();
 	val->mem_unit = PAGE_SIZE;
 }
@@ -7077,10 +7079,10 @@ void adjust_managed_page_count(struct page *page, long count)
 {
 	spin_lock(&managed_page_count_lock);
 	atomic_long_add(count, &page_zone(page)->managed_pages);
-	totalram_pages += count;
+	totalram_pages_add(count);
 #ifdef CONFIG_HIGHMEM
 	if (PageHighMem(page))
-		totalhigh_pages += count;
+		totalhigh_pages_add(count);
 #endif
 	spin_unlock(&managed_page_count_lock);
 }
@@ -7123,9 +7125,9 @@ EXPORT_SYMBOL(free_reserved_area);
 void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
-	totalram_pages++;
+	totalram_pages_inc();
 	atomic_long_inc(&page_zone(page)->managed_pages);
-	totalhigh_pages++;
+	totalhigh_pages_inc();
 }
 #endif
 
@@ -7174,10 +7176,10 @@ void __init mem_init_print_info(const char *str)
 		physpages << (PAGE_SHIFT - 10),
 		codesize >> 10, datasize >> 10, rosize >> 10,
 		(init_data_size + init_code_size) >> 10, bss_size >> 10,
-		(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
 		totalcma_pages << (PAGE_SHIFT - 10),
 #ifdef	CONFIG_HIGHMEM
-		totalhigh_pages << (PAGE_SHIFT - 10),
+		totalhigh_pages() << (PAGE_SHIFT - 10),
 #endif
 		str ? ", " : "", str ? str : "");
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index b1f0f54470fb..6ece1e2fe76e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -109,13 +109,14 @@ struct shmem_falloc {
 #ifdef CONFIG_TMPFS
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return totalram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	unsigned long nr_pages = totalram_pages;
-	return min(nr_pages - totalhigh_pages, nr_pages / 2);
+	unsigned long nr_pages = totalram_pages();
+
+	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 }
 #endif
 
@@ -3302,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			size = memparse(value,&rest);
 			if (*rest == '%') {
 				size <<= PAGE_SHIFT;
-				size *= totalram_pages;
+				size *= totalram_pages();
 				do_div(size, 100);
 				rest++;
 			}
diff --git a/mm/slab.c b/mm/slab.c
index 01991060714c..73fe23e649c9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1235,7 +1235,7 @@ void __init kmem_cache_init(void)
 	 * page orders on machines with more than 32MB of memory if
 	 * not overridden on the command line.
 	 */
-	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
 		slab_max_order = SLAB_MAX_ORDER_HI;
 
 	/* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/swap.c b/mm/swap.c
index 5d786019eab9..4d8a1f1afaab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
  */
 void __init swap_setup(void)
 {
-	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
+	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..4df23d64aac7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void)
 	if (sysctl_overcommit_kbytes)
 		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
 	else
-		allowed = ((totalram_pages - hugetlb_total_pages())
+		allowed = ((totalram_pages() - hugetlb_total_pages())
 			   * sysctl_overcommit_ratio / 100);
 	allowed += total_swap_pages;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..871e41c55e23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count,
 
 	might_sleep();
 
-	if (count > totalram_pages)
+	if (count > totalram_pages())
 		return NULL;
 
 	size = (unsigned long)count << PAGE_SHIFT;
@@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	unsigned long real_size = size;
 
 	size = PAGE_ALIGN(size);
-	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
+	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
 		goto fail;
 
 	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
diff --git a/mm/workingset.c b/mm/workingset.c
index d46f8c92aa2f..dcb994f2acc2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -549,7 +549,7 @@ static int __init workingset_init(void)
 	 * double the initial memory by using totalram_pages as-is.
 	 */
 	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
-	max_order = fls_long(totalram_pages - 1);
+	max_order = fls_long(totalram_pages() - 1);
 	if (max_order > timestamp_bits)
 		bucket_order = max_order - timestamp_bits;
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9d96b8..a4e4d36ec085 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = {
 
 static bool zswap_is_full(void)
 {
-	return totalram_pages * zswap_max_pool_percent / 100 <
-		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+	return totalram_pages() * zswap_max_pool_percent / 100 <
+			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 }
 
 static void zswap_update_total_size(void)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ff727ff61b5b..0e2f71ab8367 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1131,7 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug);
 static int __init dccp_init(void)
 {
 	unsigned long goal;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 	int ehash_order, bhash_order, i;
 	int rc;
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 1c002c0fb712..950613ee7881 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1866,7 +1866,7 @@ void __init dn_route_init(void)
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
 
-	goal = totalram_pages >> (26 - PAGE_SHIFT);
+	goal = totalram_pages() >> (26 - PAGE_SHIFT);
 
 	for(order = 0; (1UL << order) < goal; order++)
 		/* NOTHING */;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03b51cdcc731..b467a7cabf40 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1000,7 +1000,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 
 	slots = tcpmhash_entries;
 	if (!slots) {
-		if (totalram_pages >= 128 * 1024)
+		if (totalram_pages() >= 128 * 1024)
 			slots = 16 * 1024;
 		else
 			slots = 8 * 1024;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5eb990830348..741b533148ba 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2248,7 +2248,7 @@ static __always_inline unsigned int total_extension_size(void)
 
 int nf_conntrack_init_start(void)
 {
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 	int max_factor = 8;
 	int ret = -ENOMEM;
 	int i;
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 88b520ba2abc..8d86e39d6280 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -274,7 +274,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 	struct xt_hashlimit_htable *hinfo;
 	const struct seq_operations *ops;
 	unsigned int size, i;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 	int ret;
 
 	if (cfg->size) {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a5b24182b3cc..d5878ae55840 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1368,7 +1368,7 @@ static __init int sctp_init(void)
 	int status = -EINVAL;
 	unsigned long goal;
 	unsigned long limit;
-	unsigned long nr_pages = totalram_pages;
+	unsigned long nr_pages = totalram_pages();
 	int max_share;
 	int order;
 	int num_entries;
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 16bd18747cfa..d6f32807b347 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -106,7 +106,7 @@ void ima_add_kexec_buffer(struct kimage *image)
 		kexec_segment_size = ALIGN(ima_get_binary_runtime_size() +
 					   PAGE_SIZE / 2, PAGE_SIZE);
 	if ((kexec_segment_size == ULONG_MAX) ||
-	    ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) {
+	    ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages() / 2)) {
 		pr_err("Binary measurement list too large.\n");
 		return;
 	}
-- 
cgit 


From 808153e1187fa77ac7d7dad261ff476888dcf398 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Dec 2018 00:34:50 -0800
Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

devm_memremap_pages() is a facility that can create struct page entries
for any arbitrary range and give drivers the ability to subvert core
aspects of page management.

Specifically the facility is tightly integrated with the kernel's memory
hotplug functionality.  It injects an altmap argument deep into the
architecture specific vmemmap implementation to allow allocating from
specific reserved pages, and it has Linux specific assumptions about page
structure reference counting relative to get_user_pages() and
get_user_pages_fast().  It was an oversight and a mistake that this was
not marked EXPORT_SYMBOL_GPL from the outset.

Again, devm_memremap_pagex() exposes and relies upon core kernel internal
assumptions and will continue to evolve along with 'struct page', memory
hotplug, and support for new memory types / topologies.  Only an in-kernel
GPL-only driver is expected to keep up with this ongoing evolution.  This
interface, and functionality derived from this interface, is not suitable
for kernel-external drivers.

Link: http://lkml.kernel.org/r/154275557457.76910.16923571232582744134.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c                 | 2 +-
 tools/testing/nvdimm/test/iomap.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9eced2cc9f94..61dbcaa95530 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -233,7 +233,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_array:
 	return ERR_PTR(error);
 }
-EXPORT_SYMBOL(devm_memremap_pages);
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ff9d3a5825e1..ed18a0cbc0c8 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		return nfit_res->buf + offset - nfit_res->res.start;
 	return devm_memremap_pages(dev, pgmap);
 }
-EXPORT_SYMBOL(__wrap_devm_memremap_pages);
+EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
 
 pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
 {
-- 
cgit 


From 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Dec 2018 00:34:54 -0800
Subject: mm, devm_memremap_pages: kill mapping "System RAM" support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given the fact that devm_memremap_pages() requires a percpu_ref that is
torn down by devm_memremap_pages_release() the current support for mapping
RAM is broken.

Support for remapping "System RAM" has been broken since the beginning and
there is no existing user of this this code path, so just kill the support
and make it an explicit error.

This cleanup also simplifies a follow-on patch to fix the error path when
setting a devm release action for devm_memremap_pages_release() fails.

Link: http://lkml.kernel.org/r/154275557997.76910.14689813630968180480.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 61dbcaa95530..99d14940acfa 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -167,15 +167,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	is_ram = region_intersects(align_start, align_size,
 		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
 
-	if (is_ram == REGION_MIXED) {
-		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
-				__func__, res);
+	if (is_ram != REGION_DISJOINT) {
+		WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+				is_ram == REGION_MIXED ? "mixed" : "ram", res);
 		return ERR_PTR(-ENXIO);
 	}
 
-	if (is_ram == REGION_INTERSECTS)
-		return __va(res->start);
-
 	if (!pgmap->ref)
 		return ERR_PTR(-EINVAL);
 
-- 
cgit 


From a95c90f1e2c253b280385ecf3d4ebfe476926b28 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Dec 2018 00:34:57 -0800
Subject: mm, devm_memremap_pages: fix shutdown handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down.  However, the result from devm_add_action() is not checked.

Checking the error from devm_add_action() is not enough.  The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run.  Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly.  This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.

Without this change we could fail to register the teardown of
devm_memremap_pages().  The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed.  However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.

An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately.  However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.

Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/pmem.c                | 14 +++-----------
 drivers/nvdimm/pmem.c             | 13 +++++--------
 include/linux/memremap.h          |  2 ++
 kernel/memremap.c                 | 30 ++++++++++++++----------------
 tools/testing/nvdimm/test/iomap.c | 15 ++++++++++++++-
 5 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 99e2aace8078..2c1f459c0c63 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -48,9 +48,8 @@ static void dax_pmem_percpu_exit(void *data)
 	percpu_ref_exit(ref);
 }
 
-static void dax_pmem_percpu_kill(void *data)
+static void dax_pmem_percpu_kill(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
 
 	dev_dbg(dax_pmem->dev, "trace\n");
@@ -112,17 +111,10 @@ static int dax_pmem_probe(struct device *dev)
 	}
 
 	dax_pmem->pgmap.ref = &dax_pmem->ref;
+	dax_pmem->pgmap.kill = dax_pmem_percpu_kill;
 	addr = devm_memremap_pages(dev, &dax_pmem->pgmap);
-	if (IS_ERR(addr)) {
-		devm_remove_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
-		percpu_ref_exit(&dax_pmem->ref);
+	if (IS_ERR(addr))
 		return PTR_ERR(addr);
-	}
-
-	rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill,
-							&dax_pmem->ref);
-	if (rc)
-		return rc;
 
 	/* adjust the dax_region resource to the start of data */
 	memcpy(&res, &dax_pmem->pgmap.res, sizeof(res));
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0e39e3d1846f..d28418b05a04 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -309,8 +309,11 @@ static void pmem_release_queue(void *q)
 	blk_cleanup_queue(q);
 }
 
-static void pmem_freeze_queue(void *q)
+static void pmem_freeze_queue(struct percpu_ref *ref)
 {
+	struct request_queue *q;
+
+	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_freeze_queue_start(q);
 }
 
@@ -402,6 +405,7 @@ static int pmem_attach_disk(struct device *dev,
 
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
+	pmem->pgmap.kill = pmem_freeze_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
@@ -427,13 +431,6 @@ static int pmem_attach_disk(struct device *dev,
 		memcpy(&bb_res, &nsio->res, sizeof(bb_res));
 	}
 
-	/*
-	 * At release time the queue must be frozen before
-	 * devm_memremap_pages is unwound
-	 */
-	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
-		return -ENOMEM;
-
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
 	pmem->virt_addr = addr;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 0ac69ddf5fc4..55db66b3716f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -111,6 +111,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
+ * @kill: callback to transition @ref to the dead state
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
@@ -122,6 +123,7 @@ struct dev_pagemap {
 	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
+	void (*kill)(struct percpu_ref *ref);
 	struct device *dev;
 	void *data;
 	enum memory_type type;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 99d14940acfa..5e45f0c327a5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -88,14 +88,10 @@ static void devm_memremap_pages_release(void *data)
 	resource_size_t align_start, align_size;
 	unsigned long pfn;
 
+	pgmap->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
 
-	if (percpu_ref_tryget_live(pgmap->ref)) {
-		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
-		percpu_ref_put(pgmap->ref);
-	}
-
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -116,7 +112,7 @@ static void devm_memremap_pages_release(void *data)
 /**
  * devm_memremap_pages - remap and provide memmap backing for the given resource
  * @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pgmap
+ * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
  * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
@@ -125,11 +121,8 @@ static void devm_memremap_pages_release(void *data)
  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
  *    must be set to true
  *
- * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
- *    time (or devm release event). The expected order of events is that ref has
- *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
- *    wait for the completion of all references being dropped and
- *    percpu_ref_exit() must occur after devm_memremap_pages_release().
+ * 3/ pgmap->ref must be 'live' on entry and will be killed at
+ *    devm_memremap_pages_release() time, or if this routine fails.
  *
  * 4/ res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -145,6 +138,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
 
+	if (!pgmap->ref || !pgmap->kill)
+		return ERR_PTR(-EINVAL);
+
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
 		- align_start;
@@ -170,12 +166,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (is_ram != REGION_DISJOINT) {
 		WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
 				is_ram == REGION_MIXED ? "mixed" : "ram", res);
-		return ERR_PTR(-ENXIO);
+		error = -ENXIO;
+		goto err_array;
 	}
 
-	if (!pgmap->ref)
-		return ERR_PTR(-EINVAL);
-
 	pgmap->dev = dev;
 
 	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
@@ -217,7 +211,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 				align_size >> PAGE_SHIFT, pgmap);
 	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
 
-	devm_add_action(dev, devm_memremap_pages_release, pgmap);
+	error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+			pgmap);
+	if (error)
+		return ERR_PTR(error);
 
 	return __va(res->start);
 
@@ -228,6 +225,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
 	pgmap_array_delete(res);
  err_array:
+	pgmap->kill(pgmap->ref);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ed18a0cbc0c8..c6635fee27d8 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -104,13 +104,26 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
 }
 EXPORT_SYMBOL(__wrap_devm_memremap);
 
+static void nfit_test_kill(void *_pgmap)
+{
+	struct dev_pagemap *pgmap = _pgmap;
+
+	pgmap->kill(pgmap->ref);
+}
+
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
 	resource_size_t offset = pgmap->res.start;
 	struct nfit_test_resource *nfit_res = get_nfit_res(offset);
 
-	if (nfit_res)
+	if (nfit_res) {
+		int rc;
+
+		rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
+		if (rc)
+			return ERR_PTR(rc);
 		return nfit_res->buf + offset - nfit_res->res.start;
+	}
 	return devm_memremap_pages(dev, pgmap);
 }
 EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
-- 
cgit 


From 69324b8f48339de2f90fdf2f774687fc6c47629a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Dec 2018 00:35:01 -0800
Subject: mm, devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for consolidating all ZONE_DEVICE enabling via
devm_memremap_pages(), teach it how to handle the constraints of
MEMORY_DEVICE_PRIVATE ranges.

[jglisse@redhat.com: call move_pfn_range_to_zone for MEMORY_DEVICE_PRIVATE]
Link: http://lkml.kernel.org/r/154275559036.76910.12434636179931292607.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 53 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5e45f0c327a5..3eef989ef035 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -98,9 +98,15 @@ static void devm_memremap_pages_release(void *data)
 		- align_start;
 
 	mem_hotplug_begin();
-	arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
-			&pgmap->altmap : NULL);
-	kasan_remove_zero_shadow(__va(align_start), align_size);
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		pfn = align_start >> PAGE_SHIFT;
+		__remove_pages(page_zone(pfn_to_page(pfn)), pfn,
+				align_size >> PAGE_SHIFT, NULL);
+	} else {
+		arch_remove_memory(align_start, align_size,
+				pgmap->altmap_valid ? &pgmap->altmap : NULL);
+		kasan_remove_zero_shadow(__va(align_start), align_size);
+	}
 	mem_hotplug_done();
 
 	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@ -187,17 +193,40 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		goto err_pfn_remap;
 
 	mem_hotplug_begin();
-	error = kasan_add_zero_shadow(__va(align_start), align_size);
-	if (error) {
-		mem_hotplug_done();
-		goto err_kasan;
+
+	/*
+	 * For device private memory we call add_pages() as we only need to
+	 * allocate and initialize struct page for the device memory. More-
+	 * over the device memory is un-accessible thus we do not want to
+	 * create a linear mapping for the memory like arch_add_memory()
+	 * would do.
+	 *
+	 * For all other device memory types, which are accessible by
+	 * the CPU, we do want the linear mapping and thus use
+	 * arch_add_memory().
+	 */
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		error = add_pages(nid, align_start >> PAGE_SHIFT,
+				align_size >> PAGE_SHIFT, NULL, false);
+	} else {
+		error = kasan_add_zero_shadow(__va(align_start), align_size);
+		if (error) {
+			mem_hotplug_done();
+			goto err_kasan;
+		}
+
+		error = arch_add_memory(nid, align_start, align_size, altmap,
+				false);
+	}
+
+	if (!error) {
+		struct zone *zone;
+
+		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+		move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
+				align_size >> PAGE_SHIFT, altmap);
 	}
 
-	error = arch_add_memory(nid, align_start, align_size, altmap, false);
-	if (!error)
-		move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
-					align_start >> PAGE_SHIFT,
-					align_size >> PAGE_SHIFT, altmap);
 	mem_hotplug_done();
 	if (error)
 		goto err_add_memory;
-- 
cgit 


From 1c30844d2dfe272d58c8fc000960b835d13aa2ac Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 28 Dec 2018 00:35:52 -0800
Subject: mm: reclaim small amounts of memory when an external fragmentation
 event occurs

An external fragmentation event was previously described as

    When the page allocator fragments memory, it records the event using
    the mm_page_alloc_extfrag event. If the fallback_order is smaller
    than a pageblock order (order-9 on 64-bit x86) then it's considered
    an event that will cause external fragmentation issues in the future.

The kernel reduces the probability of such events by increasing the
watermark sizes by calling set_recommended_min_free_kbytes early in the
lifetime of the system.  This works reasonably well in general but if
there are enough sparsely populated pageblocks then the problem can still
occur as enough memory is free overall and kswapd stays asleep.

This patch introduces a watermark_boost_factor sysctl that allows a zone
watermark to be temporarily boosted when an external fragmentation causing
events occurs.  The boosting will stall allocations that would decrease
free memory below the boosted low watermark and kswapd is woken if the
calling context allows to reclaim an amount of memory relative to the size
of the high watermark and the watermark_boost_factor until the boost is
cleared.  When kswapd finishes, it wakes kcompactd at the pageblock order
to clean some of the pageblocks that may have been affected by the
fragmentation event.  kswapd avoids any writeback, slab shrinkage and swap
from reclaim context during this operation to avoid excessive system
disruption in the name of fragmentation avoidance.  Care is taken so that
kswapd will do normal reclaim work if the system is really low on memory.

This was evaluated using the same workloads as "mm, page_alloc: Spread
allocations across zones before introducing fragmentation".

1-socket Skylake machine
config-global-dhp__workload_thpfioscale XFS (no special madvise)
4 fio threads, 1 THP allocating thread
--------------------------------------

4.20-rc3 extfrag events < order 9:   804694
4.20-rc3+patch:                      408912 (49% reduction)
4.20-rc3+patch1-4:                    18421 (98% reduction)

                                   4.20.0-rc3             4.20.0-rc3
                                 lowzone-v5r8             boost-v5r8
Amean     fault-base-1      653.58 (   0.00%)      652.71 (   0.13%)
Amean     fault-huge-1        0.00 (   0.00%)      178.93 * -99.00%*

                              4.20.0-rc3             4.20.0-rc3
                            lowzone-v5r8             boost-v5r8
Percentage huge-1        0.00 (   0.00%)        5.12 ( 100.00%)

Note that external fragmentation causing events are massively reduced by
this path whether in comparison to the previous kernel or the vanilla
kernel.  The fault latency for huge pages appears to be increased but that
is only because THP allocations were successful with the patch applied.

1-socket Skylake machine
global-dhp__workload_thpfioscale-madvhugepage-xfs (MADV_HUGEPAGE)
-----------------------------------------------------------------

4.20-rc3 extfrag events < order 9:  291392
4.20-rc3+patch:                     191187 (34% reduction)
4.20-rc3+patch1-4:                   13464 (95% reduction)

thpfioscale Fault Latencies
                                   4.20.0-rc3             4.20.0-rc3
                                 lowzone-v5r8             boost-v5r8
Min       fault-base-1      912.00 (   0.00%)      905.00 (   0.77%)
Min       fault-huge-1      127.00 (   0.00%)      135.00 (  -6.30%)
Amean     fault-base-1     1467.55 (   0.00%)     1481.67 (  -0.96%)
Amean     fault-huge-1     1127.11 (   0.00%)     1063.88 *   5.61%*

                              4.20.0-rc3             4.20.0-rc3
                            lowzone-v5r8             boost-v5r8
Percentage huge-1       77.64 (   0.00%)       83.46 (   7.49%)

As before, massive reduction in external fragmentation events, some jitter
on latencies and an increase in THP allocation success rates.

2-socket Haswell machine
config-global-dhp__workload_thpfioscale XFS (no special madvise)
4 fio threads, 5 THP allocating threads
----------------------------------------------------------------

4.20-rc3 extfrag events < order 9:  215698
4.20-rc3+patch:                     200210 (7% reduction)
4.20-rc3+patch1-4:                   14263 (93% reduction)

                                   4.20.0-rc3             4.20.0-rc3
                                 lowzone-v5r8             boost-v5r8
Amean     fault-base-5     1346.45 (   0.00%)     1306.87 (   2.94%)
Amean     fault-huge-5     3418.60 (   0.00%)     1348.94 (  60.54%)

                              4.20.0-rc3             4.20.0-rc3
                            lowzone-v5r8             boost-v5r8
Percentage huge-5        0.78 (   0.00%)        7.91 ( 910.64%)

There is a 93% reduction in fragmentation causing events, there is a big
reduction in the huge page fault latency and allocation success rate is
higher.

2-socket Haswell machine
global-dhp__workload_thpfioscale-madvhugepage-xfs (MADV_HUGEPAGE)
-----------------------------------------------------------------

4.20-rc3 extfrag events < order 9: 166352
4.20-rc3+patch:                    147463 (11% reduction)
4.20-rc3+patch1-4:                  11095 (93% reduction)

thpfioscale Fault Latencies
                                   4.20.0-rc3             4.20.0-rc3
                                 lowzone-v5r8             boost-v5r8
Amean     fault-base-5     6217.43 (   0.00%)     7419.67 * -19.34%*
Amean     fault-huge-5     3163.33 (   0.00%)     3263.80 (  -3.18%)

                              4.20.0-rc3             4.20.0-rc3
                            lowzone-v5r8             boost-v5r8
Percentage huge-5       95.14 (   0.00%)       87.98 (  -7.53%)

There is a large reduction in fragmentation events with some jitter around
the latencies and success rates.  As before, the high THP allocation
success rate does mean the system is under a lot of pressure.  However, as
the fragmentation events are reduced, it would be expected that the
long-term allocation success rate would be higher.

Link: http://lkml.kernel.org/r/20181123114528.28802-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt |  21 +++++++
 include/linux/mm.h          |   1 +
 include/linux/mmzone.h      |  11 ++--
 kernel/sysctl.c             |   8 +++
 mm/page_alloc.c             |  43 +++++++++++++-
 mm/vmscan.c                 | 133 +++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 202 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 7d73882e2c27..187ce4f599a2 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -63,6 +63,7 @@ Currently, these files are in /proc/sys/vm:
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
+- watermark_boost_factor
 - watermark_scale_factor
 - zone_reclaim_mode
 
@@ -856,6 +857,26 @@ ten times more freeable objects than there are.
 
 =============================================================
 
+watermark_boost_factor:
+
+This factor controls the level of reclaim when memory is being fragmented.
+It defines the percentage of the high watermark of a zone that will be
+reclaimed if pages of different mobility are being mixed within pageblocks.
+The intent is that compaction has less work to do in the future and to
+increase the success rate of future high-order allocations such as SLUB
+allocations, THP and hugetlbfs pages.
+
+To make it sensible with respect to the watermark_scale_factor parameter,
+the unit is in fractions of 10,000. The default value of 15,000 means
+that up to 150% of the high watermark will be reclaimed in the event of
+a pageblock being mixed due to fragmentation. The level of reclaim is
+determined by the number of fragmentation events that occurred in the
+recent past. If this value is smaller than a pageblock then a pageblocks
+worth of pages will be reclaimed (e.g.  2MB on 64-bit x86). A boost factor
+of 0 will disable the feature.
+
+=============================================================
+
 watermark_scale_factor:
 
 This factor controls the aggressiveness of kswapd. It defines the
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1d2be4c2d34a..031b2ce983f9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2256,6 +2256,7 @@ extern void zone_pcp_reset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
+extern int watermark_boost_factor;
 extern int watermark_scale_factor;
 
 /* nommu.c */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dcf1b66a96ab..5b4bfb90fb94 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,10 +269,10 @@ enum zone_watermarks {
 	NR_WMARK
 };
 
-#define min_wmark_pages(z) (z->_watermark[WMARK_MIN])
-#define low_wmark_pages(z) (z->_watermark[WMARK_LOW])
-#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH])
-#define wmark_pages(z, i) (z->_watermark[i])
+#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
+#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
+#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
@@ -364,6 +364,7 @@ struct zone {
 
 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long _watermark[NR_WMARK];
+	unsigned long watermark_boost;
 
 	unsigned long nr_reserved_highatomic;
 
@@ -890,6 +891,8 @@ static inline int is_highmem(struct zone *zone)
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
 int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5fc724e4e454..1825f712e73b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1462,6 +1462,14 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= min_free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "watermark_boost_factor",
+		.data		= &watermark_boost_factor,
+		.maxlen		= sizeof(watermark_boost_factor),
+		.mode		= 0644,
+		.proc_handler	= watermark_boost_factor_sysctl_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname	= "watermark_scale_factor",
 		.data		= &watermark_scale_factor,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32b3e121a388..80373eca453d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -262,6 +262,7 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
 int watermark_scale_factor = 10;
 
 static unsigned long nr_kernel_pages __meminitdata;
@@ -2129,6 +2130,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 	return false;
 }
 
+static inline void boost_watermark(struct zone *zone)
+{
+	unsigned long max_boost;
+
+	if (!watermark_boost_factor)
+		return;
+
+	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+			watermark_boost_factor, 10000);
+	max_boost = max(pageblock_nr_pages, max_boost);
+
+	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+		max_boost);
+}
+
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
@@ -2138,7 +2154,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-					int start_type, bool whole_block)
+		unsigned int alloc_flags, int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
 	struct free_area *area;
@@ -2160,6 +2176,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 		goto single_page;
 	}
 
+	/*
+	 * Boost watermarks to increase reclaim pressure to reduce the
+	 * likelihood of future fallbacks. Wake kswapd now as the node
+	 * may be balanced overall and kswapd will not wake naturally.
+	 */
+	boost_watermark(zone);
+	if (alloc_flags & ALLOC_KSWAPD)
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
 		goto single_page;
@@ -2443,7 +2468,8 @@ do_steal:
 	page = list_first_entry(&area->free_list[fallback_mt],
 							struct page, lru);
 
-	steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+								can_steal);
 
 	trace_mm_page_alloc_extfrag(page, order, current_order,
 		start_migratetype, fallback_mt);
@@ -7454,6 +7480,7 @@ static void __setup_per_zone_wmarks(void)
 
 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->watermark_boost = 0;
 
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -7554,6 +7581,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..bd8971a29204 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* e.g. boosted watermark reclaim leaves slabs alone */
+	unsigned int may_shrinkslab:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -2756,8 +2759,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
-			shrink_slab(sc->gfp_mask, pgdat->node_id,
+			if (sc->may_shrinkslab) {
+				shrink_slab(sc->gfp_mask, pgdat->node_id,
 				    memcg, sc->priority);
+			}
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3244,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_shrinkslab = 1,
 	};
 
 	/*
@@ -3283,6 +3289,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_shrinkslab = 1,
 	};
 	unsigned long lru_pages;
 
@@ -3329,6 +3336,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_shrinkslab = 1,
 	};
 
 	/*
@@ -3379,6 +3387,30 @@ static void age_active_anon(struct pglist_data *pgdat,
 	} while (memcg);
 }
 
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+	int i;
+	struct zone *zone;
+
+	/*
+	 * Check for watermark boosts top-down as the higher zones
+	 * are more likely to be boosted. Both watermarks and boosts
+	 * should not be checked at the time time as reclaim would
+	 * start prematurely when there is no boosting and a lower
+	 * zone is balanced.
+	 */
+	for (i = classzone_idx; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		if (zone->watermark_boost)
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * Returns true if there is an eligible zone balanced for the request order
  * and classzone_idx
@@ -3389,6 +3421,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long mark = -1;
 	struct zone *zone;
 
+	/*
+	 * Check watermarks bottom-up as lower zones are more likely to
+	 * meet watermarks.
+	 */
 	for (i = 0; i <= classzone_idx; i++) {
 		zone = pgdat->node_zones + i;
 
@@ -3517,14 +3553,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	unsigned long pflags;
+	unsigned long nr_boost_reclaim;
+	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+	bool boosted;
 	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
-		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = 1,
 	};
 
 	psi_memstall_enter(&pflags);
@@ -3532,9 +3568,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 
 	count_vm_event(PAGEOUTRUN);
 
+	/*
+	 * Account for the reclaim boost. Note that the zone boost is left in
+	 * place so that parallel allocations that are near the watermark will
+	 * stall or direct reclaim until kswapd is finished.
+	 */
+	nr_boost_reclaim = 0;
+	for (i = 0; i <= classzone_idx; i++) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		nr_boost_reclaim += zone->watermark_boost;
+		zone_boosts[i] = zone->watermark_boost;
+	}
+	boosted = nr_boost_reclaim;
+
+restart:
+	sc.priority = DEF_PRIORITY;
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
+		bool balanced;
 		bool ret;
 
 		sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3616,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		}
 
 		/*
-		 * Only reclaim if there are no eligible zones. Note that
-		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
-		 * have adjusted it.
+		 * If the pgdat is imbalanced then ignore boosting and preserve
+		 * the watermarks for a later time and restart. Note that the
+		 * zone watermarks will be still reset at the end of balancing
+		 * on the grounds that the normal reclaim should be enough to
+		 * re-evaluate if boosting is required when kswapd next wakes.
+		 */
+		balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+		if (!balanced && nr_boost_reclaim) {
+			nr_boost_reclaim = 0;
+			goto restart;
+		}
+
+		/*
+		 * If boosting is not active then only reclaim if there are no
+		 * eligible zones. Note that sc.reclaim_idx is not used as
+		 * buffer_heads_over_limit may have adjusted it.
 		 */
-		if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+		if (!nr_boost_reclaim && balanced)
 			goto out;
 
+		/* Limit the priority of boosting to avoid reclaim writeback */
+		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+			raise_priority = false;
+
+		/*
+		 * Do not writeback or swap pages for boosted reclaim. The
+		 * intent is to relieve pressure not issue sub-optimal IO
+		 * from reclaim context. If no pages are reclaimed, the
+		 * reclaim will be aborted.
+		 */
+		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+		sc.may_swap = !nr_boost_reclaim;
+		sc.may_shrinkslab = !nr_boost_reclaim;
+
 		/*
 		 * Do some background aging of the anon list, to give
 		 * pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3701,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * progress in reclaiming pages
 		 */
 		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+		/*
+		 * If reclaim made no progress for a boost, stop reclaim as
+		 * IO cannot be queued and it could be an infinite loop in
+		 * extreme circumstances.
+		 */
+		if (nr_boost_reclaim && !nr_reclaimed)
+			break;
+
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
@@ -3627,6 +3719,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		pgdat->kswapd_failures++;
 
 out:
+	/* If reclaim was boosted, account for the reclaim done in this pass */
+	if (boosted) {
+		unsigned long flags;
+
+		for (i = 0; i <= classzone_idx; i++) {
+			if (!zone_boosts[i])
+				continue;
+
+			/* Increments are under the zone lock */
+			zone = pgdat->node_zones + i;
+			spin_lock_irqsave(&zone->lock, flags);
+			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+			spin_unlock_irqrestore(&zone->lock, flags);
+		}
+
+		/*
+		 * As there is now likely space, wakeup kcompact to defragment
+		 * pageblocks.
+		 */
+		wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+	}
+
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
@@ -3855,7 +3969,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    pgdat_balanced(pgdat, order, classzone_idx)) {
+	    (pgdat_balanced(pgdat, order, classzone_idx) &&
+	     !pgdat_watermark_boosted(pgdat, classzone_idx))) {
 		/*
 		 * There may be plenty of free memory available, but it's too
 		 * fragmented for high-order allocations.  Wake up kcompactd
-- 
cgit 


From ef8444ea01d7442652f8e1b8a8b94278cb57eafd Mon Sep 17 00:00:00 2001
From: yuzhoujian <yuzhoujian@didichuxing.com>
Date: Fri, 28 Dec 2018 00:36:07 -0800
Subject: mm, oom: reorganize the oom report in dump_header

OOM report contains several sections.  The first one is the allocation
context that has triggered the OOM.  Then we have cpuset context followed
by the stack trace of the OOM path.  The tird one is the OOM memory
information.  Followed by the current memory state of all system tasks.
At last, we will show oom eligible tasks and the information about the
chosen oom victim.

One thing that makes parsing more awkward than necessary is that we do not
have a single and easily parsable line about the oom context.  This patch
is reorganizing the oom report to

1) who invoked oom and what was the allocation request

[  515.902945] tuned invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0

2) OOM stack trace

[  515.904273] CPU: 24 PID: 1809 Comm: tuned Not tainted 4.20.0-rc3+ #3
[  515.905518] Hardware name: Inspur SA5212M4/YZMB-00370-107, BIOS 4.1.10 11/14/2016
[  515.906821] Call Trace:
[  515.908062]  dump_stack+0x5a/0x73
[  515.909311]  dump_header+0x55/0x28c
[  515.914260]  oom_kill_process+0x2d8/0x300
[  515.916708]  out_of_memory+0x145/0x4a0
[  515.917932]  __alloc_pages_slowpath+0x7d2/0xa16
[  515.919157]  __alloc_pages_nodemask+0x277/0x290
[  515.920367]  filemap_fault+0x3d0/0x6c0
[  515.921529]  ? filemap_map_pages+0x2b8/0x420
[  515.922709]  ext4_filemap_fault+0x2c/0x40 [ext4]
[  515.923884]  __do_fault+0x20/0x80
[  515.925032]  __handle_mm_fault+0xbc0/0xe80
[  515.926195]  handle_mm_fault+0xfa/0x210
[  515.927357]  __do_page_fault+0x233/0x4c0
[  515.928506]  do_page_fault+0x32/0x140
[  515.929646]  ? page_fault+0x8/0x30
[  515.930770]  page_fault+0x1e/0x30

3) OOM memory information

[  515.958093] Mem-Info:
[  515.959647] active_anon:26501758 inactive_anon:1179809 isolated_anon:0
 active_file:4402672 inactive_file:483963 isolated_file:1344
 unevictable:0 dirty:4886753 writeback:0 unstable:0
 slab_reclaimable:148442 slab_unreclaimable:18741
 mapped:1347 shmem:1347 pagetables:58669 bounce:0
 free:88663 free_pcp:0 free_cma:0
...

4) current memory state of all system tasks

[  516.079544] [    744]     0   744     9211     1345   114688       82             0 systemd-journal
[  516.082034] [    787]     0   787    31764        0   143360       92             0 lvmetad
[  516.084465] [    792]     0   792    10930        1   110592      208         -1000 systemd-udevd
[  516.086865] [   1199]     0  1199    13866        0   131072      112         -1000 auditd
[  516.089190] [   1222]     0  1222    31990        1   110592      157             0 smartd
[  516.091477] [   1225]     0  1225     4864       85    81920       43             0 irqbalance
[  516.093712] [   1226]     0  1226    52612        0   258048      426             0 abrtd
[  516.112128] [   1280]     0  1280   109774       55   299008      400             0 NetworkManager
[  516.113998] [   1295]     0  1295    28817       37    69632       24             0 ksmtuned
[  516.144596] [  10718]     0 10718  2622484  1721372 15998976   267219             0 panic
[  516.145792] [  10719]     0 10719  2622484  1164767  9818112    53576             0 panic
[  516.146977] [  10720]     0 10720  2622484  1174361  9904128    53709             0 panic
[  516.148163] [  10721]     0 10721  2622484  1209070 10194944    54824             0 panic
[  516.149329] [  10722]     0 10722  2622484  1745799 14774272    91138             0 panic

5) oom context (contrains and the chosen victim).

oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0

An admin can easily get the full oom context at a single line which
makes parsing much easier.

Link: http://lkml.kernel.org/r/1542799799-36184-1-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h    | 10 ++++++++++
 kernel/cgroup/cpuset.c |  4 ++--
 mm/oom_kill.c          | 29 ++++++++++++++++++++---------
 mm/page_alloc.c        |  4 ++--
 4 files changed, 34 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 69864a547663..d07992009265 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -15,6 +15,13 @@ struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
 
+enum oom_constraint {
+	CONSTRAINT_NONE,
+	CONSTRAINT_CPUSET,
+	CONSTRAINT_MEMORY_POLICY,
+	CONSTRAINT_MEMCG,
+};
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
@@ -42,6 +49,9 @@ struct oom_control {
 	unsigned long totalpages;
 	struct task_struct *chosen;
 	unsigned long chosen_points;
+
+	/* Used to print the constraint info. */
+	enum oom_constraint constraint;
 };
 
 extern struct mutex oom_lock;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 266f10cb7222..9510a5b32eaf 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2666,9 +2666,9 @@ void cpuset_print_current_mems_allowed(void)
 	rcu_read_lock();
 
 	cgrp = task_cs(current)->css.cgroup;
-	pr_info("%s cpuset=", current->comm);
+	pr_cont(",cpuset=");
 	pr_cont_cgroup_name(cgrp);
-	pr_cont(" mems_allowed=%*pbl\n",
+	pr_cont(",mems_allowed=%*pbl",
 		nodemask_pr_args(&current->mems_allowed));
 
 	rcu_read_unlock();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 21d487749e1d..d90253f1ff93 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	return points > 0 ? points : 1;
 }
 
-enum oom_constraint {
-	CONSTRAINT_NONE,
-	CONSTRAINT_CPUSET,
-	CONSTRAINT_MEMORY_POLICY,
-	CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
 };
 
 /*
@@ -428,16 +428,25 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	rcu_read_unlock();
 }
 
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+	/* one line summary of the oom killer context. */
+	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+			oom_constraint_text[oc->constraint],
+			nodemask_pr_args(oc->nodemask));
+	cpuset_print_current_mems_allowed();
+	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+		from_kuid(&init_user_ns, task_uid(victim)));
+}
+
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask,
-		nodemask_pr_args(oc->nodemask), oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
 			current->signal->oom_score_adj);
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
 
-	cpuset_print_current_mems_allowed();
 	dump_stack();
 	if (is_memcg_oom(oc))
 		mem_cgroup_print_oom_info(oc->memcg, p);
@@ -448,6 +457,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 	}
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(oc->memcg, oc->nodemask);
+	if (p)
+		dump_oom_summary(oc, p);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e97ebaf5ba26..a48db99da7b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3517,13 +3517,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
 			current->comm, &vaf, gfp_mask, &gfp_mask,
 			nodemask_pr_args(nodemask));
 	va_end(args);
 
 	cpuset_print_current_mems_allowed();
-
+	pr_cont("\n");
 	dump_stack();
 	warn_alloc_show_mem(gfp_mask, nodemask);
 }
-- 
cgit 


From 2c2a5af6fed20cf74401c9d64319c76c5ff81309 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.com>
Date: Fri, 28 Dec 2018 00:36:22 -0800
Subject: mm, memory_hotplug: add nid parameter to arch_remove_memory

Patch series "Do not touch pages in hot-remove path", v2.

This patchset aims for two things:

 1) A better definition about offline and hot-remove stage
 2) Solving bugs where we can access non-initialized pages
    during hot-remove operations [2] [3].

This is achieved by moving all page/zone handling to the offline
stage, so we do not need to access pages when hot-removing memory.

[1] https://patchwork.kernel.org/cover/10691415/
[2] https://patchwork.kernel.org/patch/10547445/
[3] https://www.spinics.net/lists/linux-mm/msg161316.html

This patch (of 5):

This is a preparation for the following-up patches.  The idea of passing
the nid is that it will allow us to get rid of the zone parameter
afterwards.

Link: http://lkml.kernel.org/r/20181127162005.15833-2-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c            | 2 +-
 arch/powerpc/mm/mem.c          | 3 ++-
 arch/s390/mm/init.c            | 2 +-
 arch/sh/mm/init.c              | 2 +-
 arch/x86/mm/init_32.c          | 2 +-
 arch/x86/mm/init_64.c          | 3 ++-
 include/linux/memory_hotplug.h | 4 ++--
 kernel/memremap.c              | 5 ++++-
 mm/memory_hotplug.c            | 2 +-
 9 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d5e12ff1d73c..904fe55e10fc 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -661,7 +661,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 20394e52fe27..33cc6f676fa6 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -139,7 +139,8 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int __meminit arch_remove_memory(int nid, u64 start, u64 size,
+					struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 50388190b393..3e82f66d5c61 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -242,7 +242,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	/*
 	 * There is no hardware or firmware interface which could trigger a
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index c8c13c777162..a8e5c0e00fca 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -443,7 +443,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 49ecf5ecf6d3..85c94f9a87f8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -860,7 +860,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 484c1b92f078..bccff68e3267 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1141,7 +1141,8 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 	remove_pagetable(start, end, true, NULL);
 }
 
-int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int __ref arch_remove_memory(int nid, u64 start, u64 size,
+				struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7383a7a76d69..9e4d9b9b93ea 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -107,8 +107,8 @@ static inline bool movable_node_is_enabled(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-extern int arch_remove_memory(u64 start, u64 size,
-		struct vmem_altmap *altmap);
+extern int arch_remove_memory(int nid, u64 start, u64 size,
+				struct vmem_altmap *altmap);
 extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages, struct vmem_altmap *altmap);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 3eef989ef035..0d5603d76c37 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -87,6 +87,7 @@ static void devm_memremap_pages_release(void *data)
 	struct resource *res = &pgmap->res;
 	resource_size_t align_start, align_size;
 	unsigned long pfn;
+	int nid;
 
 	pgmap->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
@@ -97,13 +98,15 @@ static void devm_memremap_pages_release(void *data)
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
 		- align_start;
 
+	nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
+
 	mem_hotplug_begin();
 	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		pfn = align_start >> PAGE_SHIFT;
 		__remove_pages(page_zone(pfn_to_page(pfn)), pfn,
 				align_size >> PAGE_SHIFT, NULL);
 	} else {
-		arch_remove_memory(align_start, align_size,
+		arch_remove_memory(nid, align_start, align_size,
 				pgmap->altmap_valid ? &pgmap->altmap : NULL);
 		kasan_remove_zero_shadow(__va(align_start), align_size);
 	}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6258e0e923cc..0718cf7427b2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1841,7 +1841,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
 	memblock_free(start, size);
 	memblock_remove(start, size);
 
-	arch_remove_memory(start, size, NULL);
+	arch_remove_memory(nid, start, size, NULL);
 
 	try_offline_node(nid);
 
-- 
cgit 


From 65c78784135f847e49eb98e6b976e453e71100c3 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Fri, 28 Dec 2018 00:36:26 -0800
Subject: kernel, resource: check for IORESOURCE_SYSRAM in
 release_mem_region_adjustable

This is a preparation for the next patch.

Currently, we only call release_mem_region_adjustable() in __remove_pages
if the zone is not ZONE_DEVICE, because resources that belong to HMM/devm
are being released by themselves with devm_release_mem_region.

Since we do not want to touch any zone/page stuff during the removing of
the memory (but during the offlining), we do not want to check for the
zone here.  So we need another way to tell release_mem_region_adjustable()
to not realease the resource in case it belongs to HMM/devm.

HMM/devm acquires/releases a resource through
devm_request_mem_region/devm_release_mem_region.

These resources have the flag IORESOURCE_MEM, while resources acquired by
hot-add memory path (register_memory_resource()) contain
IORESOURCE_SYSTEM_RAM.

So, we can check for this flag in release_mem_region_adjustable, and if
the resource does not contain such flag, we know that we are dealing with
a HMM/devm resource, so we can back off.

Link: http://lkml.kernel.org/r/20181127162005.15833-3-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index b0fbf685c77a..915c02e8e5dd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1256,6 +1256,21 @@ int release_mem_region_adjustable(struct resource *parent,
 			continue;
 		}
 
+		/*
+		 * All memory regions added from memory-hotplug path have the
+		 * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
+		 * this flag, we know that we are dealing with a resource coming
+		 * from HMM/devm. HMM/devm use another mechanism to add/release
+		 * a resource. This goes via devm_request_mem_region and
+		 * devm_release_mem_region.
+		 * HMM/devm take care to release their resources when they want,
+		 * so if we are dealing with them, let us just back off here.
+		 */
+		if (!(res->flags & IORESOURCE_SYSRAM)) {
+			ret = 0;
+			break;
+		}
+
 		if (!(res->flags & IORESOURCE_MEM))
 			break;
 
-- 
cgit 


From ac46d4f3c43241ffa23d5bf36153a0830c0e02cc Mon Sep 17 00:00:00 2001
From: Jérôme Glisse <jglisse@redhat.com>
Date: Fri, 28 Dec 2018 00:38:09 -0800
Subject: mm/mmu_notifier: use structure for invalidate_range_start/end calls
 v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To avoid having to change many call sites everytime we want to add a
parameter use a structure to group all parameters for the mmu_notifier
invalidate_range_start/end cakks.  No functional changes with this patch.

[akpm@linux-foundation.org: coding style fixes]
Link: http://lkml.kernel.org/r/20181205053628.3210-3-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <zwisler@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
From: Jérôme Glisse <jglisse@redhat.com>
Subject: mm/mmu_notifier: use structure for invalidate_range_start/end calls v3

fix build warning in migrate.c when CONFIG_MMU_NOTIFIER=n

Link: http://lkml.kernel.org/r/20181213171330.8489-3-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c                     |  8 ++--
 fs/proc/task_mmu.c           |  7 +++-
 include/linux/mm.h           |  6 ++-
 include/linux/mmu_notifier.h | 87 +++++++++++++++++++++++++-------------
 kernel/events/uprobes.c      | 10 ++---
 mm/huge_memory.c             | 54 +++++++++++-------------
 mm/hugetlb.c                 | 52 +++++++++++------------
 mm/khugepaged.c              | 10 ++---
 mm/ksm.c                     | 21 ++++------
 mm/madvise.c                 | 21 +++++-----
 mm/memory.c                  | 99 ++++++++++++++++++++++----------------------
 mm/migrate.c                 | 28 ++++++-------
 mm/mmu_notifier.c            | 37 ++++-------------
 mm/mprotect.c                | 15 +++----
 mm/mremap.c                  | 10 ++---
 mm/oom_kill.c                | 17 ++++----
 mm/rmap.c                    | 30 ++++++++------
 17 files changed, 262 insertions(+), 250 deletions(-)

(limited to 'kernel')

diff --git a/fs/dax.c b/fs/dax.c
index 48132eca3761..262e14f29933 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -779,7 +779,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
 
 	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-		unsigned long address, start, end;
+		struct mmu_notifier_range range;
+		unsigned long address;
 
 		cond_resched();
 
@@ -793,7 +794,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
 		 * call mmu_notifier_invalidate_range_start() on our behalf
 		 * before taking any lock.
 		 */
-		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
+		if (follow_pte_pmd(vma->vm_mm, address, &range,
+				   &ptep, &pmdp, &ptl))
 			continue;
 
 		/*
@@ -835,7 +837,7 @@ unlock_pte:
 			pte_unmap_unlock(ptep, ptl);
 		}
 
-		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+		mmu_notifier_invalidate_range_end(&range);
 	}
 	i_mmap_unlock_read(mapping);
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47c3764c469b..b3ddceb003bc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct mmu_notifier_range range;
 		struct clear_refs_private cp = {
 			.type = type,
 		};
@@ -1139,11 +1140,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				downgrade_write(&mm->mmap_sem);
 				break;
 			}
-			mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+			mmu_notifier_range_init(&range, mm, 0, -1UL);
+			mmu_notifier_invalidate_range_start(&range);
 		}
 		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
-			mmu_notifier_invalidate_range_end(mm, 0, -1);
+			mmu_notifier_invalidate_range_end(&range);
 		tlb_finish_mmu(&tlb, 0, -1);
 		up_read(&mm->mmap_sem);
 out_mm:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c39b9dc7a90..ea1f12d15365 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1451,6 +1451,8 @@ struct mm_walk {
 	void *private;
 };
 
+struct mmu_notifier_range;
+
 int walk_page_range(unsigned long addr, unsigned long end,
 		struct mm_walk *walk);
 int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
@@ -1459,8 +1461,8 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-			     unsigned long *start, unsigned long *end,
-			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
+		   struct mmu_notifier_range *range,
+		   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 	unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 3d377805b29c..4050ec1c3b45 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -220,11 +220,8 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
-extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end,
-				  bool blockable);
-extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end,
+extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
+extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
@@ -268,33 +265,37 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 		__mmu_notifier_change_pte(mm, address, pte);
 }
 
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline void
+mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
-	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_start(mm, start, end, true);
+	if (mm_has_notifiers(range->mm)) {
+		range->blockable = true;
+		__mmu_notifier_invalidate_range_start(range);
+	}
 }
 
-static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int
+mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
-	if (mm_has_notifiers(mm))
-		return __mmu_notifier_invalidate_range_start(mm, start, end, false);
+	if (mm_has_notifiers(range->mm)) {
+		range->blockable = false;
+		return __mmu_notifier_invalidate_range_start(range);
+	}
 	return 0;
 }
 
-static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline void
+mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
 {
-	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_end(mm, start, end, false);
+	if (mm_has_notifiers(range->mm))
+		__mmu_notifier_invalidate_range_end(range, false);
 }
 
-static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline void
+mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
 {
-	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_end(mm, start, end, true);
+	if (mm_has_notifiers(range->mm))
+		__mmu_notifier_invalidate_range_end(range, true);
 }
 
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
@@ -315,6 +316,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 		__mmu_notifier_mm_destroy(mm);
 }
 
+
+static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+					   struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	range->mm = mm;
+	range->start = start;
+	range->end = end;
+}
+
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
 ({									\
 	int __young;							\
@@ -427,6 +439,23 @@ extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+struct mmu_notifier_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
+					    unsigned long start,
+					    unsigned long end)
+{
+	range->start = start;
+	range->end = end;
+}
+
+#define mmu_notifier_range_init(range, mm, start, end) \
+	_mmu_notifier_range_init(range, start, end)
+
+
 static inline int mm_has_notifiers(struct mm_struct *mm)
 {
 	return 0;
@@ -454,24 +483,24 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 {
 }
 
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline void
+mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
 }
 
-static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int
+mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
 	return 0;
 }
 
-static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline
+void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
 {
 }
 
-static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline void
+mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
 {
 }
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index abbd8da9ac21..8aef47ee7bfa 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -171,11 +171,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		.address = addr,
 	};
 	int err;
-	/* For mmu_notifiers */
-	const unsigned long mmun_start = addr;
-	const unsigned long mmun_end   = addr + PAGE_SIZE;
+	struct mmu_notifier_range range;
 	struct mem_cgroup *memcg;
 
+	mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+
 	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
 
 	err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
@@ -186,7 +186,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(old_page);
 
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_start(&range);
 	err = -EAGAIN;
 	if (!page_vma_mapped_walk(&pvmw)) {
 		mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -220,7 +220,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 	unlock_page(old_page);
 	return err;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c0e18409fde..05136ad0f325 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1134,8 +1134,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
 	int i;
 	vm_fault_t ret = 0;
 	struct page **pages;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 
 	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
 			      GFP_KERNEL);
@@ -1173,9 +1172,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
 		cond_resched();
 	}
 
-	mmun_start = haddr;
-	mmun_end   = haddr + HPAGE_PMD_SIZE;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1220,8 +1219,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above pmdp_huge_clear_flush_notify() did already call it.
 	 */
-	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-						mmun_end);
+	mmu_notifier_invalidate_range_only_end(&range);
 
 	ret |= VM_FAULT_WRITE;
 	put_page(page);
@@ -1231,7 +1229,7 @@ out:
 
 out_free_pages:
 	spin_unlock(vmf->ptl);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		memcg = (void *)page_private(pages[i]);
 		set_page_private(pages[i], 0);
@@ -1248,8 +1246,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	struct page *page = NULL, *new_page;
 	struct mem_cgroup *memcg;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 	gfp_t huge_gfp;			/* for allocation and charge */
 	vm_fault_t ret = 0;
 
@@ -1338,9 +1335,9 @@ alloc:
 				    vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
-	mmun_start = haddr;
-	mmun_end   = haddr + HPAGE_PMD_SIZE;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
 	spin_lock(vmf->ptl);
 	if (page)
@@ -1375,8 +1372,7 @@ out_mn:
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above pmdp_huge_clear_flush_notify() did already call it.
 	 */
-	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-					       mmun_end);
+	mmu_notifier_invalidate_range_only_end(&range);
 out:
 	return ret;
 out_unlock:
@@ -2015,14 +2011,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long address)
 {
 	spinlock_t *ptl;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long haddr = address & HPAGE_PUD_MASK;
+	struct mmu_notifier_range range;
 
-	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
-	ptl = pud_lock(mm, pud);
+	mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	ptl = pud_lock(vma->vm_mm, pud);
 	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
 		goto out;
-	__split_huge_pud_locked(vma, pud, haddr);
+	__split_huge_pud_locked(vma, pud, range.start);
 
 out:
 	spin_unlock(ptl);
@@ -2030,8 +2027,7 @@ out:
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above pudp_huge_clear_flush_notify() did already call it.
 	 */
-	mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-					       HPAGE_PUD_SIZE);
+	mmu_notifier_invalidate_range_only_end(&range);
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
@@ -2233,11 +2229,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct page *page)
 {
 	spinlock_t *ptl;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
+	struct mmu_notifier_range range;
 
-	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
-	ptl = pmd_lock(mm, pmd);
+	mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	ptl = pmd_lock(vma->vm_mm, pmd);
 
 	/*
 	 * If caller asks to setup a migration entries, we need a page to check
@@ -2253,7 +2250,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			clear_page_mlock(page);
 	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
 		goto out;
-	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
+	__split_huge_pmd_locked(vma, pmd, range.start, freeze);
 out:
 	spin_unlock(ptl);
 	/*
@@ -2269,8 +2266,7 @@ out:
 	 *     any further changes to individual pte will notify. So no need
 	 *     to call mmu_notifier->invalidate_range()
 	 */
-	mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-					       HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_only_end(&range);
 }
 
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a80832487981..12000ba5c868 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3240,16 +3240,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	int cow;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 	int ret = 0;
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
-	mmun_start = vma->vm_start;
-	mmun_end = vma->vm_end;
-	if (cow)
-		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+	if (cow) {
+		mmu_notifier_range_init(&range, src, vma->vm_start,
+					vma->vm_end);
+		mmu_notifier_invalidate_range_start(&range);
+	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
@@ -3325,7 +3325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	}
 
 	if (cow)
-		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range_end(&range);
 
 	return ret;
 }
@@ -3342,8 +3342,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	unsigned long mmun_start = start;	/* For mmu_notifiers */
-	unsigned long mmun_end   = end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
@@ -3359,8 +3358,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	/*
 	 * If sharing possible, alert mmu notifiers of worst case.
 	 */
-	adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, start, end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+	mmu_notifier_invalidate_range_start(&range);
 	address = start;
 	for (; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address, sz);
@@ -3428,7 +3428,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (ref_page)
 			break;
 	}
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 	tlb_end_vma(tlb, vma);
 }
 
@@ -3546,9 +3546,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *old_page, *new_page;
 	int outside_reserve = 0;
 	vm_fault_t ret = 0;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
 	unsigned long haddr = address & huge_page_mask(h);
+	struct mmu_notifier_range range;
 
 	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
@@ -3627,9 +3626,8 @@ retry_avoidcopy:
 	__SetPageUptodate(new_page);
 	set_page_huge_active(new_page);
 
-	mmun_start = haddr;
-	mmun_end = mmun_start + huge_page_size(h);
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+	mmu_notifier_invalidate_range_start(&range);
 
 	/*
 	 * Retake the page table lock to check for racing updates
@@ -3642,7 +3640,7 @@ retry_avoidcopy:
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
-		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range(mm, range.start, range.end);
 		set_huge_pte_at(mm, haddr, ptep,
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page, true);
@@ -3651,7 +3649,7 @@ retry_avoidcopy:
 		new_page = old_page;
 	}
 	spin_unlock(ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 out_release_all:
 	restore_reserve_on_error(h, vma, haddr, new_page);
 	put_page(new_page);
@@ -4340,21 +4338,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	pte_t pte;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long pages = 0;
-	unsigned long f_start = start;
-	unsigned long f_end = end;
 	bool shared_pmd = false;
+	struct mmu_notifier_range range;
 
 	/*
 	 * In the case of shared PMDs, the area to flush could be beyond
-	 * start/end.  Set f_start/f_end to cover the maximum possible
+	 * start/end.  Set range.start/range.end to cover the maximum possible
 	 * range if PMD sharing is possible.
 	 */
-	adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+	mmu_notifier_range_init(&range, mm, start, end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 
 	BUG_ON(address >= end);
-	flush_cache_range(vma, f_start, f_end);
+	flush_cache_range(vma, range.start, range.end);
 
-	mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+	mmu_notifier_invalidate_range_start(&range);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
@@ -4405,7 +4403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * did unshare a page of pmds, flush the range corresponding to the pud.
 	 */
 	if (shared_pmd)
-		flush_hugetlb_tlb_range(vma, f_start, f_end);
+		flush_hugetlb_tlb_range(vma, range.start, range.end);
 	else
 		flush_hugetlb_tlb_range(vma, start, end);
 	/*
@@ -4415,7 +4413,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return pages << h->order;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43ce2f4d2551..4f017339ddb2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	int isolated = 0, result = 0;
 	struct mem_cgroup *memcg;
 	struct vm_area_struct *vma;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pte = pte_offset_map(pmd, address);
 	pte_ptl = pte_lockptr(mm, pmd);
 
-	mmun_start = address;
-	mmun_end   = address + HPAGE_PMD_SIZE;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
 	/*
 	 * After this gup_fast can't run anymore. This also removes
@@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 */
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
diff --git a/mm/ksm.c b/mm/ksm.c
index 1a088306ef81..38c0360482fa 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1042,8 +1042,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	};
 	int swapped;
 	int err = -EFAULT;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 
 	pvmw.address = page_address_in_vma(page, vma);
 	if (pvmw.address == -EFAULT)
@@ -1051,9 +1050,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 
 	BUG_ON(PageTransCompound(page));
 
-	mmun_start = pvmw.address;
-	mmun_end   = pvmw.address + PAGE_SIZE;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, pvmw.address,
+				pvmw.address + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
 	if (!page_vma_mapped_walk(&pvmw))
 		goto out_mn;
@@ -1105,7 +1104,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 out_unlock:
 	page_vma_mapped_walk_done(&pvmw);
 out_mn:
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 out:
 	return err;
 }
@@ -1129,8 +1128,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	spinlock_t *ptl;
 	unsigned long addr;
 	int err = -EFAULT;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 
 	addr = page_address_in_vma(page, vma);
 	if (addr == -EFAULT)
@@ -1140,9 +1138,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd)
 		goto out;
 
-	mmun_start = addr;
-	mmun_end   = addr + PAGE_SIZE;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
 	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (!pte_same(*ptep, orig_pte)) {
@@ -1188,7 +1185,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	pte_unmap_unlock(ptep, ptl);
 	err = 0;
 out_mn:
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 out:
 	return err;
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 6cb1ca93e290..21a7881a2db4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
 static int madvise_free_single_vma(struct vm_area_struct *vma,
 			unsigned long start_addr, unsigned long end_addr)
 {
-	unsigned long start, end;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
 
 	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
 		return -EINVAL;
 
-	start = max(vma->vm_start, start_addr);
-	if (start >= vma->vm_end)
+	range.start = max(vma->vm_start, start_addr);
+	if (range.start >= vma->vm_end)
 		return -EINVAL;
-	end = min(vma->vm_end, end_addr);
-	if (end <= vma->vm_start)
+	range.end = min(vma->vm_end, end_addr);
+	if (range.end <= vma->vm_start)
 		return -EINVAL;
+	mmu_notifier_range_init(&range, mm, range.start, range.end);
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start, end);
+	tlb_gather_mmu(&tlb, mm, range.start, range.end);
 	update_hiwater_rss(mm);
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	madvise_free_page_range(&tlb, vma, start, end);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	tlb_finish_mmu(&tlb, start, end);
+	mmu_notifier_invalidate_range_start(&range);
+	madvise_free_page_range(&tlb, vma, range.start, range.end);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, range.start, range.end);
 
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 4ad2d293ddc2..b7a8bfe5f5ec 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 	bool is_cow;
 	int ret;
 
@@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * is_cow_mapping() returns true.
 	 */
 	is_cow = is_cow_mapping(vma->vm_flags);
-	mmun_start = addr;
-	mmun_end   = end;
-	if (is_cow)
-		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
-						    mmun_end);
+
+	if (is_cow) {
+		mmu_notifier_range_init(&range, src_mm, addr, end);
+		mmu_notifier_invalidate_range_start(&range);
+	}
 
 	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
@@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
 	if (is_cow)
-		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range_end(&range);
 	return ret;
 }
 
@@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
 
-	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+	mmu_notifier_invalidate_range_start(&range);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
 		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
-	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+	mmu_notifier_invalidate_range_end(&range);
 }
 
 /**
@@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb,
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 		unsigned long size)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
-	unsigned long end = start + size;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start, end);
-	update_hiwater_rss(mm);
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-		unmap_single_vma(&tlb, vma, start, end, NULL);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	tlb_finish_mmu(&tlb, start, end);
+	mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+	update_hiwater_rss(vma->vm_mm);
+	mmu_notifier_invalidate_range_start(&range);
+	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+		unmap_single_vma(&tlb, vma, start, range.end, NULL);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, start, range.end);
 }
 
 /**
@@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
-	unsigned long end = address + size;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, address, end);
-	update_hiwater_rss(mm);
-	mmu_notifier_invalidate_range_start(mm, address, end);
-	unmap_single_vma(&tlb, vma, address, end, details);
-	mmu_notifier_invalidate_range_end(mm, address, end);
-	tlb_finish_mmu(&tlb, address, end);
+	mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+	update_hiwater_rss(vma->vm_mm);
+	mmu_notifier_invalidate_range_start(&range);
+	unmap_single_vma(&tlb, vma, address, range.end, details);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, address, range.end);
 }
 
 /**
@@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	struct page *new_page = NULL;
 	pte_t entry;
 	int page_copied = 0;
-	const unsigned long mmun_start = vmf->address & PAGE_MASK;
-	const unsigned long mmun_end = mmun_start + PAGE_SIZE;
 	struct mem_cgroup *memcg;
+	struct mmu_notifier_range range;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 
 	__SetPageUptodate(new_page);
 
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above ptep_clear_flush_notify() did already call it.
 	 */
-	mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_only_end(&range);
 	if (old_page) {
 		/*
 		 * Don't let another task, with possibly unlocked vma,
@@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-			    unsigned long *start, unsigned long *end,
+			    struct mmu_notifier_range *range,
 			    pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
 	pgd_t *pgd;
@@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		if (!pmdpp)
 			goto out;
 
-		if (start && end) {
-			*start = address & PMD_MASK;
-			*end = *start + PMD_SIZE;
-			mmu_notifier_invalidate_range_start(mm, *start, *end);
+		if (range) {
+			mmu_notifier_range_init(range, mm, address & PMD_MASK,
+					     (address & PMD_MASK) + PMD_SIZE);
+			mmu_notifier_invalidate_range_start(range);
 		}
 		*ptlp = pmd_lock(mm, pmd);
 		if (pmd_huge(*pmd)) {
@@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 			return 0;
 		}
 		spin_unlock(*ptlp);
-		if (start && end)
-			mmu_notifier_invalidate_range_end(mm, *start, *end);
+		if (range)
+			mmu_notifier_invalidate_range_end(range);
 	}
 
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
 		goto out;
 
-	if (start && end) {
-		*start = address & PAGE_MASK;
-		*end = *start + PAGE_SIZE;
-		mmu_notifier_invalidate_range_start(mm, *start, *end);
+	if (range) {
+		range->start = address & PAGE_MASK;
+		range->end = range->start + PAGE_SIZE;
+		mmu_notifier_invalidate_range_start(range);
 	}
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 	if (!pte_present(*ptep))
@@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 	return 0;
 unlock:
 	pte_unmap_unlock(ptep, *ptlp);
-	if (start && end)
-		mmu_notifier_invalidate_range_end(mm, *start, *end);
+	if (range)
+		mmu_notifier_invalidate_range_end(range);
 out:
 	return -EINVAL;
 }
@@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
 
 	/* (void) is needed to make gcc happy */
 	(void) __cond_lock(*ptlp,
-			   !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+			   !(res = __follow_pte_pmd(mm, address, NULL,
 						    ptepp, NULL, ptlp)));
 	return res;
 }
 
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-			     unsigned long *start, unsigned long *end,
-			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+		   struct mmu_notifier_range *range,
+		   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
 	int res;
 
 	/* (void) is needed to make gcc happy */
 	(void) __cond_lock(*ptlp,
-			   !(res = __follow_pte_pmd(mm, address, start, end,
+			   !(res = __follow_pte_pmd(mm, address, range,
 						    ptepp, pmdpp, ptlp)));
 	return res;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index acda06f99754..462163f5f278 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2299,6 +2299,7 @@ next:
  */
 static void migrate_vma_collect(struct migrate_vma *migrate)
 {
+	struct mmu_notifier_range range;
 	struct mm_walk mm_walk;
 
 	mm_walk.pmd_entry = migrate_vma_collect_pmd;
@@ -2310,13 +2311,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
 	mm_walk.mm = migrate->vma->vm_mm;
 	mm_walk.private = migrate;
 
-	mmu_notifier_invalidate_range_start(mm_walk.mm,
-					    migrate->start,
-					    migrate->end);
+	mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+				migrate->end);
+	mmu_notifier_invalidate_range_start(&range);
 	walk_page_range(migrate->start, migrate->end, &mm_walk);
-	mmu_notifier_invalidate_range_end(mm_walk.mm,
-					  migrate->start,
-					  migrate->end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
 }
@@ -2697,9 +2696,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 {
 	const unsigned long npages = migrate->npages;
 	const unsigned long start = migrate->start;
-	struct vm_area_struct *vma = migrate->vma;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long addr, i, mmu_start;
+	struct mmu_notifier_range range;
+	unsigned long addr, i;
 	bool notified = false;
 
 	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2718,11 +2716,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 				continue;
 			}
 			if (!notified) {
-				mmu_start = addr;
 				notified = true;
-				mmu_notifier_invalidate_range_start(mm,
-								mmu_start,
-								migrate->end);
+
+				mmu_notifier_range_init(&range,
+							migrate->vma->vm_mm,
+							addr, migrate->end);
+				mmu_notifier_invalidate_range_start(&range);
 			}
 			migrate_vma_insert_page(migrate, addr, newpage,
 						&migrate->src[i],
@@ -2763,8 +2762,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 	 * did already call it.
 	 */
 	if (notified)
-		mmu_notifier_invalidate_range_only_end(mm, mmu_start,
-						       migrate->end);
+		mmu_notifier_invalidate_range_only_end(&range);
 }
 
 /*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 74a7dc3d11c8..9c884abc7850 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -167,28 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 	srcu_read_unlock(&srcu, id);
 }
 
-int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end,
-				  bool blockable)
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
-	struct mmu_notifier_range _range, *range = &_range;
 	struct mmu_notifier *mn;
 	int ret = 0;
 	int id;
 
-	range->blockable = blockable;
-	range->start = start;
-	range->end = end;
-	range->mm = mm;
-
 	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start) {
 			int _ret = mn->ops->invalidate_range_start(mn, range);
 			if (_ret) {
 				pr_info("%pS callback failed with %d in %sblockable context.\n",
-						mn->ops->invalidate_range_start, _ret,
-						!blockable ? "non-" : "");
+					mn->ops->invalidate_range_start, _ret,
+					!range->blockable ? "non-" : "");
 				ret = _ret;
 			}
 		}
@@ -199,27 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
 
-void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-					 unsigned long start,
-					 unsigned long end,
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
 					 bool only_end)
 {
-	struct mmu_notifier_range _range, *range = &_range;
 	struct mmu_notifier *mn;
 	int id;
 
-	/*
-	 * The end call back will never be call if the start refused to go
-	 * through because of blockable was false so here assume that we
-	 * can block.
-	 */
-	range->blockable = true;
-	range->start = start;
-	range->end = end;
-	range->mm = mm;
-
 	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
 		/*
 		 * Call invalidate_range here too to avoid the need for the
 		 * subsystem of having to register an invalidate_range_end
@@ -234,7 +213,9 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 		 * already happen under page table lock.
 		 */
 		if (!only_end && mn->ops->invalidate_range)
-			mn->ops->invalidate_range(mn, mm, start, end);
+			mn->ops->invalidate_range(mn, range->mm,
+						  range->start,
+						  range->end);
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, range);
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d331620b9e5..36cb358db170 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pgprot_t newprot, int dirty_accountable, int prot_numa)
 {
 	pmd_t *pmd;
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long next;
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
-	unsigned long mni_start = 0;
+	struct mmu_notifier_range range;
+
+	range.start = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
-		if (!mni_start) {
-			mni_start = addr;
-			mmu_notifier_invalidate_range_start(mm, mni_start, end);
+		if (!range.start) {
+			mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+			mmu_notifier_invalidate_range_start(&range);
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -214,8 +215,8 @@ next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
-	if (mni_start)
-		mmu_notifier_invalidate_range_end(mm, mni_start, end);
+	if (range.start)
+		mmu_notifier_invalidate_range_end(&range);
 
 	if (nr_huge_updates)
 		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
diff --git a/mm/mremap.c b/mm/mremap.c
index 7f9f9180e401..def01d86e36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		bool need_rmap_locks)
 {
 	unsigned long extent, next, old_end;
+	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
-	mmun_start = old_addr;
-	mmun_end   = old_end;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+	mmu_notifier_invalidate_range_start(&range);
 
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
@@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 			  new_pmd, new_addr, need_rmap_locks);
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return len + old_addr - old_end;	/* how much done */
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5442cb12e4ed..f0e8cd9edb1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -528,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
+			struct mmu_notifier_range range;
 			struct mmu_gather tlb;
 
-			tlb_gather_mmu(&tlb, mm, start, end);
-			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
-				tlb_finish_mmu(&tlb, start, end);
+			mmu_notifier_range_init(&range, mm, vma->vm_start,
+						vma->vm_end);
+			tlb_gather_mmu(&tlb, mm, range.start, range.end);
+			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+				tlb_finish_mmu(&tlb, range.start, range.end);
 				ret = false;
 				continue;
 			}
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
+			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+			mmu_notifier_invalidate_range_end(&range);
+			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..c75f72f6fe0e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -889,15 +889,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 		.address = address,
 		.flags = PVMW_SYNC,
 	};
-	unsigned long start = address, end;
+	struct mmu_notifier_range range;
 	int *cleaned = arg;
 
 	/*
 	 * We have to assume the worse case ie pmd for invalidation. Note that
 	 * the page can not be free from this function.
 	 */
-	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
-	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+	mmu_notifier_range_init(&range, vma->vm_mm, address,
+				min(vma->vm_end, address +
+				    (PAGE_SIZE << compound_order(page))));
+	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		unsigned long cstart;
@@ -949,7 +951,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 			(*cleaned)++;
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return true;
 }
@@ -1345,7 +1347,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	pte_t pteval;
 	struct page *subpage;
 	bool ret = true;
-	unsigned long start = address, end;
+	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)arg;
 
 	/* munlock has nothing to gain from examining un-locked vmas */
@@ -1369,15 +1371,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * Note that the page can not be free in this function as call of
 	 * try_to_unmap() must hold a reference on the page.
 	 */
-	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+	mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
+				min(vma->vm_end, vma->vm_start +
+				    (PAGE_SIZE << compound_order(page))));
 	if (PageHuge(page)) {
 		/*
 		 * If sharing is possible, start and end will be adjusted
 		 * accordingly.
 		 */
-		adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+		adjust_range_if_pmd_sharing_possible(vma, &range.start,
+						     &range.end);
 	}
-	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1428,9 +1433,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				 * we must flush them all.  start/end were
 				 * already adjusted above to cover this range.
 				 */
-				flush_cache_range(vma, start, end);
-				flush_tlb_range(vma, start, end);
-				mmu_notifier_invalidate_range(mm, start, end);
+				flush_cache_range(vma, range.start, range.end);
+				flush_tlb_range(vma, range.start, range.end);
+				mmu_notifier_invalidate_range(mm, range.start,
+							      range.end);
 
 				/*
 				 * The ref count of the PMD page was dropped
@@ -1650,7 +1656,7 @@ discard:
 		put_page(page);
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return ret;
 }
-- 
cgit 


From 063a7d1d3623db31ca5d2309cab6030ebf93b72f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Dec 2018 00:39:46 -0800
Subject: mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kbuild robot reported the following on a development branch that used
memremap.h in a new path:

   In file included from arch/m68k/include/asm/pgtable_mm.h:148:0,
                     from arch/m68k/include/asm/pgtable.h:5,
                     from include/linux/memremap.h:7,
                     from drivers//dax/bus.c:3:
    arch/m68k/include/asm/motorola_pgtable.h: In function 'pgd_offset':
 >> arch/m68k/include/asm/motorola_pgtable.h:199:11: error: dereferencing pointer to incomplete type 'const struct mm_struct'
      return mm->pgd + pgd_index(address);
               ^~

The ->page_fault() callback is specific to HMM.  Move it to 'struct
hmm_devmem' where the unusual asm/pgtable.h dependency can be contained in
include/linux/hmm.h.  Longer term refactoring this dependency out of HMM
is recommended, but in the meantime memremap.h remains generic.

Link: http://lkml.kernel.org/r/154534090899.3120190.6652620807617715272.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 5042db43cc26 ("mm/ZONE_DEVICE: new type of ZONE_DEVICE memory...")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hmm.h      | 24 ++++++++++++++++++++++++
 include/linux/memremap.h | 32 --------------------------------
 kernel/memremap.c        |  6 +++++-
 mm/hmm.c                 |  4 ++--
 4 files changed, 31 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ed89fbc525d2..66f9ebbb1df3 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -69,6 +69,7 @@
 #define LINUX_HMM_H
 
 #include <linux/kconfig.h>
+#include <asm/pgtable.h>
 
 #if IS_ENABLED(CONFIG_HMM)
 
@@ -486,6 +487,7 @@ struct hmm_devmem_ops {
  * @device: device to bind resource to
  * @ops: memory operations callback
  * @ref: per CPU refcount
+ * @page_fault: callback when CPU fault on an unaddressable device page
  *
  * This an helper structure for device drivers that do not wish to implement
  * the gory details related to hotplugging new memoy and allocating struct
@@ -493,7 +495,28 @@ struct hmm_devmem_ops {
  *
  * Device drivers can directly use ZONE_DEVICE memory on their own if they
  * wish to do so.
+ *
+ * The page_fault() callback must migrate page back, from device memory to
+ * system memory, so that the CPU can access it. This might fail for various
+ * reasons (device issues,  device have been unplugged, ...). When such error
+ * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
+ * set the CPU page table entry to "poisoned".
+ *
+ * Note that because memory cgroup charges are transferred to the device memory,
+ * this should never fail due to memory restrictions. However, allocation
+ * of a regular system page might still fail because we are out of memory. If
+ * that happens, the page_fault() callback must return VM_FAULT_OOM.
+ *
+ * The page_fault() callback can also try to migrate back multiple pages in one
+ * chunk, as an optimization. It must, however, prioritize the faulting address
+ * over all the others.
  */
+typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+				unsigned long addr,
+				const struct page *page,
+				unsigned int flags,
+				pmd_t *pmdp);
+
 struct hmm_devmem {
 	struct completion		completion;
 	unsigned long			pfn_first;
@@ -503,6 +526,7 @@ struct hmm_devmem {
 	struct dev_pagemap		pagemap;
 	const struct hmm_devmem_ops	*ops;
 	struct percpu_ref		ref;
+	dev_page_fault_t		page_fault;
 };
 
 /*
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 55db66b3716f..f0628660d541 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -4,8 +4,6 @@
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
 
-#include <asm/pgtable.h>
-
 struct resource;
 struct device;
 
@@ -66,47 +64,18 @@ enum memory_type {
 };
 
 /*
- * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two
- * callbacks:
- *   page_fault()
- *   page_free()
- *
  * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
  * explanation in include/linux/memory_hotplug.h.
  *
- * The page_fault() callback must migrate page back, from device memory to
- * system memory, so that the CPU can access it. This might fail for various
- * reasons (device issues,  device have been unplugged, ...). When such error
- * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
- * set the CPU page table entry to "poisoned".
- *
- * Note that because memory cgroup charges are transferred to the device memory,
- * this should never fail due to memory restrictions. However, allocation
- * of a regular system page might still fail because we are out of memory. If
- * that happens, the page_fault() callback must return VM_FAULT_OOM.
- *
- * The page_fault() callback can also try to migrate back multiple pages in one
- * chunk, as an optimization. It must, however, prioritize the faulting address
- * over all the others.
- *
- *
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
- *
- * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter.
  */
-typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
-				unsigned long addr,
-				const struct page *page,
-				unsigned int flags,
-				pmd_t *pmdp);
 typedef void (*dev_page_free_t)(struct page *page, void *data);
 
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
- * @page_fault: callback when CPU fault on an unaddressable device page
  * @page_free: free page callback when page refcount reaches 1
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
@@ -117,7 +86,6 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  */
 struct dev_pagemap {
-	dev_page_fault_t page_fault;
 	dev_page_free_t page_free;
 	struct vmem_altmap altmap;
 	bool altmap_valid;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 0d5603d76c37..a856cb5ff192 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/xarray.h>
+#include <linux/hmm.h>
 
 static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -24,6 +25,9 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 		       pmd_t *pmdp)
 {
 	struct page *page = device_private_entry_to_page(entry);
+	struct hmm_devmem *devmem;
+
+	devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
 
 	/*
 	 * The page_fault() callback must migrate page back to system memory
@@ -39,7 +43,7 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 	 * There is a more in-depth description of what that callback can and
 	 * cannot do, in include/linux/memremap.h
 	 */
-	return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+	return devmem->page_fault(vma, addr, page, flags, pmdp);
 }
 EXPORT_SYMBOL(device_private_entry_fault);
 #endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/hmm.c b/mm/hmm.c
index 789587731217..a04e4b810610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1087,10 +1087,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
+	devmem->page_fault = hmm_devmem_fault;
 
 	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_fault = hmm_devmem_fault;
 	devmem->pagemap.page_free = hmm_devmem_free;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
@@ -1141,10 +1141,10 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
+	devmem->page_fault = hmm_devmem_fault;
 
 	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
 	devmem->pagemap.res = *devmem->resource;
-	devmem->pagemap.page_fault = hmm_devmem_fault;
 	devmem->pagemap.page_free = hmm_devmem_free;
 	devmem->pagemap.altmap_valid = false;
 	devmem->pagemap.ref = &devmem->ref;
-- 
cgit 


From 0f4991e8fd48987ae476a92cdee6bfec4aff31b8 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 28 Dec 2018 00:40:00 -0800
Subject: kernel/fork.c: mark 'stack_vm_area' with __maybe_unused

Fixes gcc '-Wunused-but-set-variable' warning when CONFIG_VMAP_STACK is
not set:

kernel/fork.c: In function 'dup_task_struct':
kernel/fork.c:843:20: warning:
 variable 'stack_vm_area' set but not used [-Wunused-but-set-variable]

Link: http://lkml.kernel.org/r/1545965190-2381-1-git-send-email-yuehaibing@huawei.com
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c979605fe806..d439c48ecf18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,7 +841,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	unsigned long *stack;
-	struct vm_struct *stack_vm_area;
+	struct vm_struct *stack_vm_area __maybe_unused;
 	int err;
 
 	if (node == NUMA_NO_NODE)
-- 
cgit 


From 1a80dade010c7a7f4885a4c4c2a7ac22cc7b34df Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 28 Dec 2018 07:22:26 -0800
Subject: Fix failure path in alloc_pid()

The failure path removes the allocated PIDs from the wrong namespace.
This could lead to us inadvertently reusing PIDs in the leaf namespace
and leaking PIDs in parent namespaces.

Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API")
Cc: <stable@vger.kernel.org>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index b2f6c506035d..20881598bdfa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -233,8 +233,10 @@ out_unlock:
 
 out_free:
 	spin_lock_irq(&pidmap_lock);
-	while (++i <= ns->level)
-		idr_remove(&ns->idr, (pid->numbers + i)->nr);
+	while (++i <= ns->level) {
+		upid = pid->numbers + i;
+		idr_remove(&upid->ns->idr, upid->nr);
+	}
 
 	/* On failure to allocate the first pid, reset the state */
 	if (ns->pid_allocated == PIDNS_ADDING)
-- 
cgit 


From 9ef7fa507d6b53a96de4da3298c5f01bde603c0a Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 4 Dec 2018 19:38:25 -0800
Subject: kgdb: Remove irq flags from roundup

The function kgdb_roundup_cpus() was passed a parameter that was
documented as:

> the flags that will be used when restoring the interrupts. There is
> local_irq_save() call before kgdb_roundup_cpus().

Nobody used those flags.  Anyone who wanted to temporarily turn on
interrupts just did local_irq_enable() and local_irq_disable() without
looking at them.  So we can definitely remove the flags.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 arch/arc/kernel/kgdb.c     | 2 +-
 arch/arm/kernel/kgdb.c     | 2 +-
 arch/arm64/kernel/kgdb.c   | 2 +-
 arch/hexagon/kernel/kgdb.c | 9 ++-------
 arch/mips/kernel/kgdb.c    | 2 +-
 arch/powerpc/kernel/kgdb.c | 2 +-
 arch/sh/kernel/kgdb.c      | 2 +-
 arch/sparc/kernel/smp_64.c | 2 +-
 arch/x86/kernel/kgdb.c     | 9 ++-------
 include/linux/kgdb.h       | 9 ++-------
 kernel/debug/debug_core.c  | 2 +-
 11 files changed, 14 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
index 9a3c34af2ae8..0932851028e0 100644
--- a/arch/arc/kernel/kgdb.c
+++ b/arch/arc/kernel/kgdb.c
@@ -197,7 +197,7 @@ static void kgdb_call_nmi_hook(void *ignored)
 	kgdb_nmicallback(raw_smp_processor_id(), NULL);
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	local_irq_enable();
 	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index caa0dbe3dc61..f21077b077be 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -175,7 +175,7 @@ static void kgdb_call_nmi_hook(void *ignored)
        kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
        local_irq_enable();
        smp_call_function(kgdb_call_nmi_hook, NULL, 0);
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index a20de58061a8..12c339ff6e75 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -289,7 +289,7 @@ static void kgdb_call_nmi_hook(void *ignored)
 	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	local_irq_enable();
 	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
index 16c24b22d0b2..012e0e230ac2 100644
--- a/arch/hexagon/kernel/kgdb.c
+++ b/arch/hexagon/kernel/kgdb.c
@@ -119,17 +119,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
 
 /**
  * kgdb_roundup_cpus - Get other CPUs into a holding pattern
- * @flags: Current IRQ state
  *
  * On SMP systems, we need to get the attention of the other CPUs
  * and get them be in a known state.  This should do what is needed
  * to get the other CPUs to call kgdb_wait(). Note that on some arches,
  * the NMI approach is not used for rounding up all the CPUs. For example,
- * in case of MIPS, smp_call_function() is used to roundup CPUs. In
- * this case, we have to make sure that interrupts are enabled before
- * calling smp_call_function(). The argument to this function is
- * the flags that will be used when restoring the interrupts. There is
- * local_irq_save() call before kgdb_roundup_cpus().
+ * in case of MIPS, smp_call_function() is used to roundup CPUs.
  *
  * On non-SMP systems, this is not called.
  */
@@ -139,7 +134,7 @@ static void hexagon_kgdb_nmi_hook(void *ignored)
 	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	local_irq_enable();
 	smp_call_function(hexagon_kgdb_nmi_hook, NULL, 0);
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index eb6c0d582626..2b05effc17b4 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -219,7 +219,7 @@ static void kgdb_call_nmi_hook(void *ignored)
 	set_fs(old_fs);
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	local_irq_enable();
 	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 59c578f865aa..b0e804844be0 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -124,7 +124,7 @@ static int kgdb_call_nmi_hook(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_SMP
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	smp_send_debugger_break();
 }
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index 4f04c6638a4d..cc57630f6bf2 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -319,7 +319,7 @@ static void kgdb_call_nmi_hook(void *ignored)
 	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
 }
 
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	local_irq_enable();
 	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 4792e08ad36b..f45d876983f1 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1014,7 +1014,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 }
 
 #ifdef CONFIG_KGDB
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	smp_cross_call(&xcall_kgdb_capture, 0, 0, 0);
 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8e36f249646e..ac6291a4178d 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -422,21 +422,16 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs)
 #ifdef CONFIG_SMP
 /**
  *	kgdb_roundup_cpus - Get other CPUs into a holding pattern
- *	@flags: Current IRQ state
  *
  *	On SMP systems, we need to get the attention of the other CPUs
  *	and get them be in a known state.  This should do what is needed
  *	to get the other CPUs to call kgdb_wait(). Note that on some arches,
  *	the NMI approach is not used for rounding up all the CPUs. For example,
- *	in case of MIPS, smp_call_function() is used to roundup CPUs. In
- *	this case, we have to make sure that interrupts are enabled before
- *	calling smp_call_function(). The argument to this function is
- *	the flags that will be used when restoring the interrupts. There is
- *	local_irq_save() call before kgdb_roundup_cpus().
+ *	in case of MIPS, smp_call_function() is used to roundup CPUs.
  *
  *	On non-SMP systems, this is not called.
  */
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
 {
 	apic->send_IPI_allbutself(APIC_DM_NMI);
 }
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index e465bb15912d..05e5b2eb0d32 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -178,21 +178,16 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code,
 
 /**
  *	kgdb_roundup_cpus - Get other CPUs into a holding pattern
- *	@flags: Current IRQ state
  *
  *	On SMP systems, we need to get the attention of the other CPUs
  *	and get them into a known state.  This should do what is needed
  *	to get the other CPUs to call kgdb_wait(). Note that on some arches,
  *	the NMI approach is not used for rounding up all the CPUs. For example,
- *	in case of MIPS, smp_call_function() is used to roundup CPUs. In
- *	this case, we have to make sure that interrupts are enabled before
- *	calling smp_call_function(). The argument to this function is
- *	the flags that will be used when restoring the interrupts. There is
- *	local_irq_save() call before kgdb_roundup_cpus().
+ *	in case of MIPS, smp_call_function() is used to roundup CPUs.
  *
  *	On non-SMP systems, this is not called.
  */
-extern void kgdb_roundup_cpus(unsigned long flags);
+extern void kgdb_roundup_cpus(void);
 
 /**
  *	kgdb_arch_set_pc - Generic call back to the program counter
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 65c0f1363788..f3cadda45f07 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -593,7 +593,7 @@ return_normal:
 
 	/* Signal the other CPUs to enter kgdb_wait() */
 	else if ((!kgdb_single_step) && kgdb_do_roundup)
-		kgdb_roundup_cpus(flags);
+		kgdb_roundup_cpus();
 #endif
 
 	/*
-- 
cgit 


From 3cd99ac3559855f69afbc1d5080e17eaa12394ff Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 4 Dec 2018 19:38:26 -0800
Subject: kgdb: Fix kgdb_roundup_cpus() for arches who used smp_call_function()

When I had lockdep turned on and dropped into kgdb I got a nice splat
on my system.  Specifically it hit:
  DEBUG_LOCKS_WARN_ON(current->hardirq_context)

Specifically it looked like this:
  sysrq: SysRq : DEBUG
  ------------[ cut here ]------------
  DEBUG_LOCKS_WARN_ON(current->hardirq_context)
  WARNING: CPU: 0 PID: 0 at .../kernel/locking/lockdep.c:2875 lockdep_hardirqs_on+0xf0/0x160
  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0 #27
  pstate: 604003c9 (nZCv DAIF +PAN -UAO)
  pc : lockdep_hardirqs_on+0xf0/0x160
  ...
  Call trace:
   lockdep_hardirqs_on+0xf0/0x160
   trace_hardirqs_on+0x188/0x1ac
   kgdb_roundup_cpus+0x14/0x3c
   kgdb_cpu_enter+0x53c/0x5cc
   kgdb_handle_exception+0x180/0x1d4
   kgdb_compiled_brk_fn+0x30/0x3c
   brk_handler+0x134/0x178
   do_debug_exception+0xfc/0x178
   el1_dbg+0x18/0x78
   kgdb_breakpoint+0x34/0x58
   sysrq_handle_dbg+0x54/0x5c
   __handle_sysrq+0x114/0x21c
   handle_sysrq+0x30/0x3c
   qcom_geni_serial_isr+0x2dc/0x30c
  ...
  ...
  irq event stamp: ...45
  hardirqs last  enabled at (...44): [...] __do_softirq+0xd8/0x4e4
  hardirqs last disabled at (...45): [...] el1_irq+0x74/0x130
  softirqs last  enabled at (...42): [...] _local_bh_enable+0x2c/0x34
  softirqs last disabled at (...43): [...] irq_exit+0xa8/0x100
  ---[ end trace adf21f830c46e638 ]---

Looking closely at it, it seems like a really bad idea to be calling
local_irq_enable() in kgdb_roundup_cpus().  If nothing else that seems
like it could violate spinlock semantics and cause a deadlock.

Instead, let's use a private csd alongside
smp_call_function_single_async() to round up the other CPUs.  Using
smp_call_function_single_async() doesn't require interrupts to be
enabled so we can remove the offending bit of code.

In order to avoid duplicating this across all the architectures that
use the default kgdb_roundup_cpus(), we'll add a "weak" implementation
to debug_core.c.

Looking at all the people who previously had copies of this code,
there were a few variants.  I've attempted to keep the variants
working like they used to.  Specifically:
* For arch/arc we passed NULL to kgdb_nmicallback() instead of
  get_irq_regs().
* For arch/mips there was a bit of extra code around
  kgdb_nmicallback()

NOTE: In this patch we will still get into trouble if we try to round
up a CPU that failed to round up before.  We'll try to round it up
again and potentially hang when we try to grab the csd lock.  That's
not new behavior but we'll still try to do better in a future patch.

Suggested-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 arch/arc/kernel/kgdb.c     | 10 ++--------
 arch/arm/kernel/kgdb.c     | 12 ------------
 arch/arm64/kernel/kgdb.c   | 12 ------------
 arch/hexagon/kernel/kgdb.c | 27 ---------------------------
 arch/mips/kernel/kgdb.c    |  9 +--------
 arch/powerpc/kernel/kgdb.c |  4 ++--
 arch/sh/kernel/kgdb.c      | 12 ------------
 include/linux/kgdb.h       | 15 +++++++++++++--
 kernel/debug/debug_core.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 9 files changed, 59 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
index 0932851028e0..68d9fe4b5aa7 100644
--- a/arch/arc/kernel/kgdb.c
+++ b/arch/arc/kernel/kgdb.c
@@ -192,18 +192,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 	instruction_pointer(regs) = ip;
 }
 
-static void kgdb_call_nmi_hook(void *ignored)
+void kgdb_call_nmi_hook(void *ignored)
 {
+	/* Default implementation passes get_irq_regs() but we don't */
 	kgdb_nmicallback(raw_smp_processor_id(), NULL);
 }
 
-void kgdb_roundup_cpus(void)
-{
-	local_irq_enable();
-	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
-	local_irq_disable();
-}
-
 struct kgdb_arch arch_kgdb_ops = {
 	/* breakpoint instruction: TRAP_S 0x3 */
 #ifdef CONFIG_CPU_BIG_ENDIAN
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index f21077b077be..d9a69e941463 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -170,18 +170,6 @@ static struct undef_hook kgdb_compiled_brkpt_hook = {
 	.fn			= kgdb_compiled_brk_fn
 };
 
-static void kgdb_call_nmi_hook(void *ignored)
-{
-       kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
-}
-
-void kgdb_roundup_cpus(void)
-{
-       local_irq_enable();
-       smp_call_function(kgdb_call_nmi_hook, NULL, 0);
-       local_irq_disable();
-}
-
 static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 {
 	struct pt_regs *regs = args->regs;
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 12c339ff6e75..da880247c734 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -284,18 +284,6 @@ static struct step_hook kgdb_step_hook = {
 	.fn		= kgdb_step_brk_fn
 };
 
-static void kgdb_call_nmi_hook(void *ignored)
-{
-	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
-}
-
-void kgdb_roundup_cpus(void)
-{
-	local_irq_enable();
-	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
-	local_irq_disable();
-}
-
 static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 {
 	struct pt_regs *regs = args->regs;
diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
index 012e0e230ac2..b95d12038a4e 100644
--- a/arch/hexagon/kernel/kgdb.c
+++ b/arch/hexagon/kernel/kgdb.c
@@ -115,33 +115,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
 	instruction_pointer(regs) = pc;
 }
 
-#ifdef CONFIG_SMP
-
-/**
- * kgdb_roundup_cpus - Get other CPUs into a holding pattern
- *
- * On SMP systems, we need to get the attention of the other CPUs
- * and get them be in a known state.  This should do what is needed
- * to get the other CPUs to call kgdb_wait(). Note that on some arches,
- * the NMI approach is not used for rounding up all the CPUs. For example,
- * in case of MIPS, smp_call_function() is used to roundup CPUs.
- *
- * On non-SMP systems, this is not called.
- */
-
-static void hexagon_kgdb_nmi_hook(void *ignored)
-{
-	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
-}
-
-void kgdb_roundup_cpus(void)
-{
-	local_irq_enable();
-	smp_call_function(hexagon_kgdb_nmi_hook, NULL, 0);
-	local_irq_disable();
-}
-#endif
-
 
 /*  Not yet working  */
 void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index 2b05effc17b4..42f057a6c215 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -207,7 +207,7 @@ void arch_kgdb_breakpoint(void)
 		".set\treorder");
 }
 
-static void kgdb_call_nmi_hook(void *ignored)
+void kgdb_call_nmi_hook(void *ignored)
 {
 	mm_segment_t old_fs;
 
@@ -219,13 +219,6 @@ static void kgdb_call_nmi_hook(void *ignored)
 	set_fs(old_fs);
 }
 
-void kgdb_roundup_cpus(void)
-{
-	local_irq_enable();
-	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
-	local_irq_disable();
-}
-
 static int compute_signal(int tt)
 {
 	struct hard_trap_info *ht;
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index b0e804844be0..b4ce54d73337 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -117,7 +117,7 @@ int kgdb_skipexception(int exception, struct pt_regs *regs)
 	return kgdb_isremovedbreak(regs->nip);
 }
 
-static int kgdb_call_nmi_hook(struct pt_regs *regs)
+static int kgdb_debugger_ipi(struct pt_regs *regs)
 {
 	kgdb_nmicallback(raw_smp_processor_id(), regs);
 	return 0;
@@ -502,7 +502,7 @@ int kgdb_arch_init(void)
 	old__debugger_break_match = __debugger_break_match;
 	old__debugger_fault_handler = __debugger_fault_handler;
 
-	__debugger_ipi = kgdb_call_nmi_hook;
+	__debugger_ipi = kgdb_debugger_ipi;
 	__debugger = kgdb_debugger;
 	__debugger_bpt = kgdb_handle_breakpoint;
 	__debugger_sstep = kgdb_singlestep;
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index cc57630f6bf2..14e012ad7c57 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -314,18 +314,6 @@ BUILD_TRAP_HANDLER(singlestep)
 	local_irq_restore(flags);
 }
 
-static void kgdb_call_nmi_hook(void *ignored)
-{
-	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
-}
-
-void kgdb_roundup_cpus(void)
-{
-	local_irq_enable();
-	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
-	local_irq_disable();
-}
-
 static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 {
 	int ret;
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 05e5b2eb0d32..24422865cd18 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -176,14 +176,25 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code,
 			   char *remcom_out_buffer,
 			   struct pt_regs *regs);
 
+/**
+ *	kgdb_call_nmi_hook - Call kgdb_nmicallback() on the current CPU
+ *	@ignored: This parameter is only here to match the prototype.
+ *
+ *	If you're using the default implementation of kgdb_roundup_cpus()
+ *	this function will be called per CPU.  If you don't implement
+ *	kgdb_call_nmi_hook() a default will be used.
+ */
+
+extern void kgdb_call_nmi_hook(void *ignored);
+
 /**
  *	kgdb_roundup_cpus - Get other CPUs into a holding pattern
  *
  *	On SMP systems, we need to get the attention of the other CPUs
  *	and get them into a known state.  This should do what is needed
  *	to get the other CPUs to call kgdb_wait(). Note that on some arches,
- *	the NMI approach is not used for rounding up all the CPUs. For example,
- *	in case of MIPS, smp_call_function() is used to roundup CPUs.
+ *	the NMI approach is not used for rounding up all the CPUs.  Normally
+ *	those architectures can just not implement this and get the default.
  *
  *	On non-SMP systems, this is not called.
  */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index f3cadda45f07..10db2833a423 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -55,6 +55,7 @@
 #include <linux/mm.h>
 #include <linux/vmacache.h>
 #include <linux/rcupdate.h>
+#include <linux/irq.h>
 
 #include <asm/cacheflush.h>
 #include <asm/byteorder.h>
@@ -220,6 +221,46 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
 	return 0;
 }
 
+#ifdef CONFIG_SMP
+
+/*
+ * Default (weak) implementation for kgdb_roundup_cpus
+ */
+
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
+
+void __weak kgdb_call_nmi_hook(void *ignored)
+{
+	/*
+	 * NOTE: get_irq_regs() is supposed to get the registers from
+	 * before the IPI interrupt happened and so is supposed to
+	 * show where the processor was.  In some situations it's
+	 * possible we might be called without an IPI, so it might be
+	 * safer to figure out how to make kgdb_breakpoint() work
+	 * properly here.
+	 */
+	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
+}
+
+void __weak kgdb_roundup_cpus(void)
+{
+	call_single_data_t *csd;
+	int this_cpu = raw_smp_processor_id();
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		/* No need to roundup ourselves */
+		if (cpu == this_cpu)
+			continue;
+
+		csd = &per_cpu(kgdb_roundup_csd, cpu);
+		csd->func = kgdb_call_nmi_hook;
+		smp_call_function_single_async(cpu, csd);
+	}
+}
+
+#endif
+
 /*
  * Some architectures need cache flushes when we set/clear a
  * breakpoint:
-- 
cgit 


From 87b095928584da7d5cb3149016f00b0b139c2292 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 4 Dec 2018 19:38:27 -0800
Subject: kgdb: Don't round up a CPU that failed rounding up before

If we're using the default implementation of kgdb_roundup_cpus() that
uses smp_call_function_single_async() we can end up hanging
kgdb_roundup_cpus() if we try to round up a CPU that failed to round
up before.

Specifically smp_call_function_single_async() will try to wait on the
csd lock for the CPU that we're trying to round up.  If the previous
round up never finished then that lock could still be held and we'll
just sit there hanging.

There's not a lot of use trying to round up a CPU that failed to round
up before.  Let's keep a flag that indicates whether the CPU started
but didn't finish to round up before.  If we see that flag set then
we'll skip the next round up.

In general we have a few goals here:
- We never want to end up calling smp_call_function_single_async()
  when the csd is still locked.  This is accomplished because
  flush_smp_call_function_queue() unlocks the csd _before_ invoking
  the callback.  That means that when kgdb_nmicallback() runs we know
  for sure the the csd is no longer locked.  Thus when we set
  "rounding_up = false" we know for sure that the csd is unlocked.
- If there are no timeouts rounding up we should never skip a round
  up.

NOTE #1: In general trying to continue running after failing to round
up CPUs doesn't appear to be supported in the debugger.  When I
simulate this I find that kdb reports "Catastrophic error detected"
when I try to continue.  I can overrule and continue anyway, but it
should be noted that we may be entering the land of dragons here.
Possibly the "Catastrophic error detected" was added _because_ of the
future failure to round up, but even so this is an area of the code
that hasn't been strongly tested.

NOTE #2: I did a bit of testing before and after this change.  I
introduced a 10 second hang in the kernel while holding a spinlock
that I could invoke on a certain CPU with 'taskset -c 3 cat /sys/...".

Before this change if I did:
- Invoke hang
- Enter debugger
- g (which warns about Catastrophic error, g again to go anyway)
- g
- Enter debugger

...I'd hang the rest of the 10 seconds without getting a debugger
prompt.  After this change I end up in the debugger the 2nd time after
only 1 second with the standard warning about 'Timed out waiting for
secondary CPUs.'

I'll also note that once the CPU finished waiting I could actually
debug it (aka "btc" worked)

I won't promise that everything works perfectly if the errant CPU
comes back at just the wrong time (like as we're entering or exiting
the debugger) but it certainly seems like an improvement.

NOTE #3: setting 'kgdb_info[cpu].rounding_up = false' is in
kgdb_nmicallback() instead of kgdb_call_nmi_hook() because some
implementations override kgdb_call_nmi_hook().  It shouldn't hurt to
have it in kgdb_nmicallback() in any case.

NOTE #4: this logic is really only needed because there is no API call
like "smp_try_call_function_single_async()" or "smp_csd_is_locked()".
If such an API existed then we'd use it instead, but it seemed a bit
much to add an API like this just for kgdb.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 kernel/debug/debug_core.c | 20 +++++++++++++++++++-
 kernel/debug/debug_core.h |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 10db2833a423..1fb8b239e567 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -247,6 +247,7 @@ void __weak kgdb_roundup_cpus(void)
 	call_single_data_t *csd;
 	int this_cpu = raw_smp_processor_id();
 	int cpu;
+	int ret;
 
 	for_each_online_cpu(cpu) {
 		/* No need to roundup ourselves */
@@ -254,8 +255,23 @@ void __weak kgdb_roundup_cpus(void)
 			continue;
 
 		csd = &per_cpu(kgdb_roundup_csd, cpu);
+
+		/*
+		 * If it didn't round up last time, don't try again
+		 * since smp_call_function_single_async() will block.
+		 *
+		 * If rounding_up is false then we know that the
+		 * previous call must have at least started and that
+		 * means smp_call_function_single_async() won't block.
+		 */
+		if (kgdb_info[cpu].rounding_up)
+			continue;
+		kgdb_info[cpu].rounding_up = true;
+
 		csd->func = kgdb_call_nmi_hook;
-		smp_call_function_single_async(cpu, csd);
+		ret = smp_call_function_single_async(cpu, csd);
+		if (ret)
+			kgdb_info[cpu].rounding_up = false;
 	}
 }
 
@@ -788,6 +804,8 @@ int kgdb_nmicallback(int cpu, void *regs)
 	struct kgdb_state kgdb_var;
 	struct kgdb_state *ks = &kgdb_var;
 
+	kgdb_info[cpu].rounding_up = false;
+
 	memset(ks, 0, sizeof(struct kgdb_state));
 	ks->cpu			= cpu;
 	ks->linux_regs		= regs;
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 127d9bc49fb4..b4a7c326d546 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -42,6 +42,7 @@ struct debuggerinfo_struct {
 	int			ret_state;
 	int			irq_depth;
 	int			enter_kgdb;
+	bool			rounding_up;
 };
 
 extern struct debuggerinfo_struct kgdb_info[];
-- 
cgit 


From 162bc7f5afd75b72acbe3c5f3488ef7e64a3fe36 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 4 Dec 2018 19:38:28 -0800
Subject: kdb: Don't back trace on a cpu that didn't round up

If you have a CPU that fails to round up and then run 'btc' you'll end
up crashing in kdb becaue we dereferenced NULL.  Let's add a check.
It's wise to also set the task to NULL when leaving the debugger so
that if we fail to round up on a later entry into the debugger we
won't backtrace a stale task.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 kernel/debug/debug_core.c       |  4 ++++
 kernel/debug/kdb/kdb_bt.c       | 11 ++++++++++-
 kernel/debug/kdb/kdb_debugger.c |  7 -------
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1fb8b239e567..5cc608de6883 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -592,6 +592,8 @@ return_normal:
 				arch_kgdb_ops.correct_hw_break();
 			if (trace_on)
 				tracing_on();
+			kgdb_info[cpu].debuggerinfo = NULL;
+			kgdb_info[cpu].task = NULL;
 			kgdb_info[cpu].exception_state &=
 				~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
 			kgdb_info[cpu].enter_kgdb--;
@@ -724,6 +726,8 @@ kgdb_restore:
 	if (trace_on)
 		tracing_on();
 
+	kgdb_info[cpu].debuggerinfo = NULL;
+	kgdb_info[cpu].task = NULL;
 	kgdb_info[cpu].exception_state &=
 		~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
 	kgdb_info[cpu].enter_kgdb--;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7921ae4fca8d..7e2379aa0a1e 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -186,7 +186,16 @@ kdb_bt(int argc, const char **argv)
 		kdb_printf("btc: cpu status: ");
 		kdb_parse("cpu\n");
 		for_each_online_cpu(cpu) {
-			sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
+			void *kdb_tsk = KDB_TSK(cpu);
+
+			/* If a CPU failed to round up we could be here */
+			if (!kdb_tsk) {
+				kdb_printf("WARNING: no task for cpu %ld\n",
+					   cpu);
+				continue;
+			}
+
+			sprintf(buf, "btt 0x%px\n", kdb_tsk);
 			kdb_parse(buf);
 			touch_nmi_watchdog();
 		}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 15e1a7af5dd0..53a0df6e4d92 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks)
 	kdb_bp_remove();
 	KDB_STATE_CLEAR(DOING_SS);
 	KDB_STATE_SET(PAGER);
-	/* zero out any offline cpu data */
-	for_each_present_cpu(i) {
-		if (!cpu_online(i)) {
-			kgdb_info[i].debuggerinfo = NULL;
-			kgdb_info[i].task = NULL;
-		}
-	}
 	if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
 		ks->pass_exception = 1;
 		KDB_FLAG_SET(CATASTROPHIC);
-- 
cgit 


From 7faedcd4de4356dfcda30841a5ffef3bf35fa06c Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Fri, 20 Jul 2018 11:23:37 +0200
Subject: kdb: use bool for binary state indicators

defcmd_in_progress  is the state trace for command group processing
- within a command group or not -  usable  is an indicator if a command
set is valid (allocated/non-empty) - so use a bool for those binary
indication here.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 kernel/debug/kdb/kdb_main.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d72b32c66f7d..82a3b32a7cfc 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -658,7 +658,7 @@ static void kdb_cmderror(int diag)
  */
 struct defcmd_set {
 	int count;
-	int usable;
+	bool usable;
 	char *name;
 	char *usage;
 	char *help;
@@ -666,7 +666,7 @@ struct defcmd_set {
 };
 static struct defcmd_set *defcmd_set;
 static int defcmd_set_count;
-static int defcmd_in_progress;
+static bool defcmd_in_progress;
 
 /* Forward references */
 static int kdb_exec_defcmd(int argc, const char **argv);
@@ -676,9 +676,9 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
 	struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
 	char **save_command = s->command;
 	if (strcmp(argv0, "endefcmd") == 0) {
-		defcmd_in_progress = 0;
+		defcmd_in_progress = false;
 		if (!s->count)
-			s->usable = 0;
+			s->usable = false;
 		if (s->usable)
 			/* macros are always safe because when executed each
 			 * internal command re-enters kdb_parse() and is
@@ -695,7 +695,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
 	if (!s->command) {
 		kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
 			   cmdstr);
-		s->usable = 0;
+		s->usable = false;
 		return KDB_NOTIMP;
 	}
 	memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
@@ -737,7 +737,7 @@ static int kdb_defcmd(int argc, const char **argv)
 	       defcmd_set_count * sizeof(*defcmd_set));
 	s = defcmd_set + defcmd_set_count;
 	memset(s, 0, sizeof(*s));
-	s->usable = 1;
+	s->usable = true;
 	s->name = kdb_strdup(argv[1], GFP_KDB);
 	if (!s->name)
 		goto fail_name;
@@ -756,7 +756,7 @@ static int kdb_defcmd(int argc, const char **argv)
 		s->help[strlen(s->help)-1] = '\0';
 	}
 	++defcmd_set_count;
-	defcmd_in_progress = 1;
+	defcmd_in_progress = true;
 	kfree(save_defcmd_set);
 	return 0;
 fail_help:
-- 
cgit 


From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 27 Dec 2018 13:46:17 -0800
Subject: sched/fair: Fix infinite loop in update_blocked_averages() by
 reverting a9e7f6544b9c

Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
scheduler under high loads, starting at around the v4.18 time frame,
and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
manipulation.

Do a (manual) revert of:

  a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")

It turns out that the list_del_leaf_cfs_rq() introduced by this commit
is a surprising property that was not considered in followup commits
such as:

  9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")

As Vincent Guittot explains:

 "I think that there is a bigger problem with commit a9e7f6544b9c and
  cfs_rq throttling:

  Let take the example of the following topology TG2 --> TG1 --> root:

   1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
      cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
      one path because it has never been used and can't be throttled so
      tmp_alone_branch will point to leaf_cfs_rq_list at the end.

   2) Then TG1 is throttled

   3) and we add TG3 as a new child of TG1.

   4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
      cfs_rq and tmp_alone_branch will stay  on rq->leaf_cfs_rq_list.

  With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list.
  So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
  cfs_rq is removed from the list.
  Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list
  but tmp_alone_branch still points to TG3 cfs_rq because its throttled
  parent can't be enqueued when the lock is released.
  tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should.

  So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
  points on another TG cfs_rq, the next TG cfs_rq that will be added,
  will be linked outside rq->leaf_cfs_rq_list - which is bad.

  In addition, we can break the ordering of the cfs_rq in
  rq->leaf_cfs_rq_list but this ordering is used to update and
  propagate the update from leaf down to root."

Instead of trying to work through all these cases and trying to reproduce
the very high loads that produced the lockup to begin with, simplify
the code temporarily by reverting a9e7f6544b9c - which change was clearly
not thought through completely.

This (hopefully) gives us a kernel that doesn't lock up so people
can continue to enjoy their holidays without worrying about regressions. ;-)

[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]

Analyzed-by: Xie XiuQi <xiexiuqi@huawei.com>
Analyzed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Reported-by: Sargun Dhillon <sargun@sargun.me>
Reported-by: Xie XiuQi <xiexiuqi@huawei.com>
Tested-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Tested-by: Sargun Dhillon <sargun@sargun.me>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: <stable@vger.kernel.org> # v4.13+
Cc: Bin Li <huawei.libin@huawei.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path")
Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 43 +++++++++----------------------------------
 1 file changed, 9 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1907506318a..6483834f1278 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 }
 
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
-	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
-				 leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
-		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq)	\
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -7647,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load.weight)
-		return false;
-
-	if (cfs_rq->avg.load_sum)
-		return false;
-
-	if (cfs_rq->avg.util_sum)
-		return false;
-
-	if (cfs_rq->avg.runnable_load_sum)
-		return false;
-
-	return true;
-}
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq, *pos;
+	struct cfs_rq *cfs_rq;
 	const struct sched_class *curr_class;
 	struct rq_flags rf;
 	bool done = true;
@@ -7679,7 +7661,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -7694,13 +7676,6 @@ static void update_blocked_averages(int cpu)
 		if (se && !skip_blocked_update(se))
 			update_load_avg(cfs_rq_of(se), se, 0);
 
-		/*
-		 * There can be a lot of idle CPU cgroups.  Don't let fully
-		 * decayed cfs_rqs linger on the list.
-		 */
-		if (cfs_rq_is_decayed(cfs_rq))
-			list_del_leaf_cfs_rq(cfs_rq);
-
 		/* Don't need periodic decay once load/util_avg are null */
 		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
@@ -10570,10 +10545,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq, *pos;
+	struct cfs_rq *cfs_rq;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
-- 
cgit 


From c08435ec7f2bc8f4109401f696fd55159b4b40cb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:27 +0100
Subject: bpf: move {prev_,}insn_idx into verifier env

Move prev_insn_idx and insn_idx from the do_check() function into
the verifier environment, so they can be read inside the various
helper functions for handling the instructions. It's easier to put
this into the environment rather than changing all call-sites only
to pass it along. insn_idx is useful in particular since this later
on allows to hold state in env->insn_aux_data[env->insn_idx].

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  2 ++
 kernel/bpf/verifier.c        | 76 ++++++++++++++++++++++----------------------
 2 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c233efc106c6..3f84f3e87704 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -212,6 +212,8 @@ struct bpf_subprog_info {
  * one verifier_env per bpf_check() call
  */
 struct bpf_verifier_env {
+	u32 insn_idx;
+	u32 prev_insn_idx;
 	struct bpf_prog *prog;		/* eBPF program being verified */
 	const struct bpf_verifier_ops *ops;
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71d86e3024ae..afa8515bbb34 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5650,7 +5650,6 @@ static int do_check(struct bpf_verifier_env *env)
 	struct bpf_insn *insns = env->prog->insnsi;
 	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len, i;
-	int insn_idx, prev_insn_idx = 0;
 	int insn_processed = 0;
 	bool do_print_state = false;
 
@@ -5670,19 +5669,19 @@ static int do_check(struct bpf_verifier_env *env)
 			BPF_MAIN_FUNC /* callsite */,
 			0 /* frameno */,
 			0 /* subprogno, zero == main subprog */);
-	insn_idx = 0;
+
 	for (;;) {
 		struct bpf_insn *insn;
 		u8 class;
 		int err;
 
-		if (insn_idx >= insn_cnt) {
+		if (env->insn_idx >= insn_cnt) {
 			verbose(env, "invalid insn idx %d insn_cnt %d\n",
-				insn_idx, insn_cnt);
+				env->insn_idx, insn_cnt);
 			return -EFAULT;
 		}
 
-		insn = &insns[insn_idx];
+		insn = &insns[env->insn_idx];
 		class = BPF_CLASS(insn->code);
 
 		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
@@ -5692,7 +5691,7 @@ static int do_check(struct bpf_verifier_env *env)
 			return -E2BIG;
 		}
 
-		err = is_state_visited(env, insn_idx);
+		err = is_state_visited(env, env->insn_idx);
 		if (err < 0)
 			return err;
 		if (err == 1) {
@@ -5700,9 +5699,9 @@ static int do_check(struct bpf_verifier_env *env)
 			if (env->log.level) {
 				if (do_print_state)
 					verbose(env, "\nfrom %d to %d: safe\n",
-						prev_insn_idx, insn_idx);
+						env->prev_insn_idx, env->insn_idx);
 				else
-					verbose(env, "%d: safe\n", insn_idx);
+					verbose(env, "%d: safe\n", env->insn_idx);
 			}
 			goto process_bpf_exit;
 		}
@@ -5715,10 +5714,10 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level > 1 || (env->log.level && do_print_state)) {
 			if (env->log.level > 1)
-				verbose(env, "%d:", insn_idx);
+				verbose(env, "%d:", env->insn_idx);
 			else
 				verbose(env, "\nfrom %d to %d:",
-					prev_insn_idx, insn_idx);
+					env->prev_insn_idx, env->insn_idx);
 			print_verifier_state(env, state->frame[state->curframe]);
 			do_print_state = false;
 		}
@@ -5729,20 +5728,20 @@ static int do_check(struct bpf_verifier_env *env)
 				.private_data	= env,
 			};
 
-			verbose_linfo(env, insn_idx, "; ");
-			verbose(env, "%d: ", insn_idx);
+			verbose_linfo(env, env->insn_idx, "; ");
+			verbose(env, "%d: ", env->insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
 		}
 
 		if (bpf_prog_is_dev_bound(env->prog->aux)) {
-			err = bpf_prog_offload_verify_insn(env, insn_idx,
-							   prev_insn_idx);
+			err = bpf_prog_offload_verify_insn(env, env->insn_idx,
+							   env->prev_insn_idx);
 			if (err)
 				return err;
 		}
 
 		regs = cur_regs(env);
-		env->insn_aux_data[insn_idx].seen = true;
+		env->insn_aux_data[env->insn_idx].seen = true;
 
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
@@ -5768,13 +5767,13 @@ static int do_check(struct bpf_verifier_env *env)
 			/* check that memory (src_reg + off) is readable,
 			 * the state of dst_reg will be updated by this func
 			 */
-			err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
-					       BPF_SIZE(insn->code), BPF_READ,
-					       insn->dst_reg, false);
+			err = check_mem_access(env, env->insn_idx, insn->src_reg,
+					       insn->off, BPF_SIZE(insn->code),
+					       BPF_READ, insn->dst_reg, false);
 			if (err)
 				return err;
 
-			prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+			prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
 
 			if (*prev_src_type == NOT_INIT) {
 				/* saw a valid insn
@@ -5799,10 +5798,10 @@ static int do_check(struct bpf_verifier_env *env)
 			enum bpf_reg_type *prev_dst_type, dst_reg_type;
 
 			if (BPF_MODE(insn->code) == BPF_XADD) {
-				err = check_xadd(env, insn_idx, insn);
+				err = check_xadd(env, env->insn_idx, insn);
 				if (err)
 					return err;
-				insn_idx++;
+				env->insn_idx++;
 				continue;
 			}
 
@@ -5818,13 +5817,13 @@ static int do_check(struct bpf_verifier_env *env)
 			dst_reg_type = regs[insn->dst_reg].type;
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-					       BPF_SIZE(insn->code), BPF_WRITE,
-					       insn->src_reg, false);
+			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+					       insn->off, BPF_SIZE(insn->code),
+					       BPF_WRITE, insn->src_reg, false);
 			if (err)
 				return err;
 
-			prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+			prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
 
 			if (*prev_dst_type == NOT_INIT) {
 				*prev_dst_type = dst_reg_type;
@@ -5852,9 +5851,9 @@ static int do_check(struct bpf_verifier_env *env)
 			}
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-					       BPF_SIZE(insn->code), BPF_WRITE,
-					       -1, false);
+			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+					       insn->off, BPF_SIZE(insn->code),
+					       BPF_WRITE, -1, false);
 			if (err)
 				return err;
 
@@ -5872,9 +5871,9 @@ static int do_check(struct bpf_verifier_env *env)
 				}
 
 				if (insn->src_reg == BPF_PSEUDO_CALL)
-					err = check_func_call(env, insn, &insn_idx);
+					err = check_func_call(env, insn, &env->insn_idx);
 				else
-					err = check_helper_call(env, insn->imm, insn_idx);
+					err = check_helper_call(env, insn->imm, env->insn_idx);
 				if (err)
 					return err;
 
@@ -5887,7 +5886,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				insn_idx += insn->off + 1;
+				env->insn_idx += insn->off + 1;
 				continue;
 
 			} else if (opcode == BPF_EXIT) {
@@ -5901,8 +5900,8 @@ static int do_check(struct bpf_verifier_env *env)
 
 				if (state->curframe) {
 					/* exit from nested function */
-					prev_insn_idx = insn_idx;
-					err = prepare_func_exit(env, &insn_idx);
+					env->prev_insn_idx = env->insn_idx;
+					err = prepare_func_exit(env, &env->insn_idx);
 					if (err)
 						return err;
 					do_print_state = true;
@@ -5932,7 +5931,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
-				err = pop_stack(env, &prev_insn_idx, &insn_idx);
+				err = pop_stack(env, &env->prev_insn_idx,
+						&env->insn_idx);
 				if (err < 0) {
 					if (err != -ENOENT)
 						return err;
@@ -5942,7 +5942,7 @@ process_bpf_exit:
 					continue;
 				}
 			} else {
-				err = check_cond_jmp_op(env, insn, &insn_idx);
+				err = check_cond_jmp_op(env, insn, &env->insn_idx);
 				if (err)
 					return err;
 			}
@@ -5959,8 +5959,8 @@ process_bpf_exit:
 				if (err)
 					return err;
 
-				insn_idx++;
-				env->insn_aux_data[insn_idx].seen = true;
+				env->insn_idx++;
+				env->insn_aux_data[env->insn_idx].seen = true;
 			} else {
 				verbose(env, "invalid BPF_LD mode\n");
 				return -EINVAL;
@@ -5970,7 +5970,7 @@ process_bpf_exit:
 			return -EINVAL;
 		}
 
-		insn_idx++;
+		env->insn_idx++;
 	}
 
 	verbose(env, "processed %d insns (limit %d), stack depth ",
-- 
cgit 


From 144cd91c4c2bced6eb8a7e25e590f6618a11e854 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:28 +0100
Subject: bpf: move tmp variable into ax register in interpreter

This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run()
into the hidden ax register. The latter is currently only used in JITs
for constant blinding as a temporary scratch register, meaning the BPF
interpreter will never see the use of ax. Therefore it is safe to use
it for the cases where tmp has been used earlier. This is needed to later
on allow restricted hidden use of ax in both interpreter and JITs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/core.c      | 34 +++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8c8544b375eb..84a6a98f8328 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -60,7 +60,8 @@ struct sock_reuseport;
  * constants. See JIT pre-step in bpf_jit_blind_constants().
  */
 #define BPF_REG_AX		MAX_BPF_REG
-#define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
+#define MAX_BPF_EXT_REG		(MAX_BPF_REG + 1)
+#define MAX_BPF_JIT_REG		MAX_BPF_EXT_REG
 
 /* unused opcode to mark special call to bpf_tail_call() helper */
 #define BPF_TAIL_CALL	0xf0
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 38de580abcc2..a34312a5eea2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -54,6 +54,7 @@
 #define DST	regs[insn->dst_reg]
 #define SRC	regs[insn->src_reg]
 #define FP	regs[BPF_REG_FP]
+#define AX	regs[BPF_REG_AX]
 #define ARG1	regs[BPF_REG_ARG1]
 #define CTX	regs[BPF_REG_CTX]
 #define IMM	insn->imm
@@ -1188,7 +1189,6 @@ bool bpf_opcode_in_insntable(u8 code)
  */
 static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
-	u64 tmp;
 #define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
 #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
 	static const void *jumptable[256] = {
@@ -1268,36 +1268,36 @@ select_insn:
 		(*(s64 *) &DST) >>= IMM;
 		CONT;
 	ALU64_MOD_X:
-		div64_u64_rem(DST, SRC, &tmp);
-		DST = tmp;
+		div64_u64_rem(DST, SRC, &AX);
+		DST = AX;
 		CONT;
 	ALU_MOD_X:
-		tmp = (u32) DST;
-		DST = do_div(tmp, (u32) SRC);
+		AX = (u32) DST;
+		DST = do_div(AX, (u32) SRC);
 		CONT;
 	ALU64_MOD_K:
-		div64_u64_rem(DST, IMM, &tmp);
-		DST = tmp;
+		div64_u64_rem(DST, IMM, &AX);
+		DST = AX;
 		CONT;
 	ALU_MOD_K:
-		tmp = (u32) DST;
-		DST = do_div(tmp, (u32) IMM);
+		AX = (u32) DST;
+		DST = do_div(AX, (u32) IMM);
 		CONT;
 	ALU64_DIV_X:
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		tmp = (u32) DST;
-		do_div(tmp, (u32) SRC);
-		DST = (u32) tmp;
+		AX = (u32) DST;
+		do_div(AX, (u32) SRC);
+		DST = (u32) AX;
 		CONT;
 	ALU64_DIV_K:
 		DST = div64_u64(DST, IMM);
 		CONT;
 	ALU_DIV_K:
-		tmp = (u32) DST;
-		do_div(tmp, (u32) IMM);
-		DST = (u32) tmp;
+		AX = (u32) DST;
+		do_div(AX, (u32) IMM);
+		DST = (u32) AX;
 		CONT;
 	ALU_END_TO_BE:
 		switch (IMM) {
@@ -1553,7 +1553,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
 static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
 { \
 	u64 stack[stack_size / sizeof(u64)]; \
-	u64 regs[MAX_BPF_REG]; \
+	u64 regs[MAX_BPF_EXT_REG]; \
 \
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
 	ARG1 = (u64) (unsigned long) ctx; \
@@ -1566,7 +1566,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
 				      const struct bpf_insn *insn) \
 { \
 	u64 stack[stack_size / sizeof(u64)]; \
-	u64 regs[MAX_BPF_REG]; \
+	u64 regs[MAX_BPF_EXT_REG]; \
 \
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
 	BPF_R1 = r1; \
-- 
cgit 


From 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:29 +0100
Subject: bpf: enable access to ax register also from verifier rewrite

Right now we are using BPF ax register in JIT for constant blinding as
well as in interpreter as temporary variable. Verifier will not be able
to use it simply because its use will get overridden from the former in
bpf_jit_blind_insn(). However, it can be made to work in that blinding
will be skipped if there is prior use in either source or destination
register on the instruction. Taking constraints of ax into account, the
verifier is then open to use it in rewrites under some constraints. Note,
ax register already has mappings in every eBPF JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  7 +------
 kernel/bpf/core.c      | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 84a6a98f8328..ad106d845b22 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -53,12 +53,7 @@ struct sock_reuseport;
 #define BPF_REG_D	BPF_REG_8	/* data, callee-saved */
 #define BPF_REG_H	BPF_REG_9	/* hlen, callee-saved */
 
-/* Kernel hidden auxiliary/helper register for hardening step.
- * Only used by eBPF JITs. It's nothing more than a temporary
- * register that JITs use internally, only that here it's part
- * of eBPF instructions that have been rewritten for blinding
- * constants. See JIT pre-step in bpf_jit_blind_constants().
- */
+/* Kernel hidden auxiliary/helper register. */
 #define BPF_REG_AX		MAX_BPF_REG
 #define MAX_BPF_EXT_REG		(MAX_BPF_REG + 1)
 #define MAX_BPF_JIT_REG		MAX_BPF_EXT_REG
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a34312a5eea2..f908b9356025 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -858,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
 	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
 
+	/* Constraints on AX register:
+	 *
+	 * AX register is inaccessible from user space. It is mapped in
+	 * all JITs, and used here for constant blinding rewrites. It is
+	 * typically "stateless" meaning its contents are only valid within
+	 * the executed instruction, but not across several instructions.
+	 * There are a few exceptions however which are further detailed
+	 * below.
+	 *
+	 * Constant blinding is only used by JITs, not in the interpreter.
+	 * The interpreter uses AX in some occasions as a local temporary
+	 * register e.g. in DIV or MOD instructions.
+	 *
+	 * In restricted circumstances, the verifier can also use the AX
+	 * register for rewrites as long as they do not interfere with
+	 * the above cases!
+	 */
+	if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
+		goto out;
+
 	if (from->imm == 0 &&
 	    (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
 	     from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
-- 
cgit 


From 0d6303db7970e6f56ae700fa07e11eb510cda125 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:30 +0100
Subject: bpf: restrict map value pointer arithmetic for unprivileged

Restrict map value pointer arithmetic for unprivileged users in that
arithmetic itself must not go out of bounds as opposed to the actual
access later on. Therefore after each adjust_ptr_min_max_vals() with a
map value pointer as a destination it will simulate a check_map_access()
of 1 byte on the destination and once that fails the program is rejected
for unprivileged program loads. We use this later on for masking any
pointer arithmetic with the remainder of the map value space. The
likelihood of breaking any existing real-world unprivileged eBPF
program is very small for this corner case.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index afa8515bbb34..4da8c73e168f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3249,6 +3249,17 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	__update_reg_bounds(dst_reg);
 	__reg_deduce_bounds(dst_reg);
 	__reg_bound_offset(dst_reg);
+
+	/* For unprivileged we require that resulting offset must be in bounds
+	 * in order to be able to sanitize access later on.
+	 */
+	if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE &&
+	    check_map_access(env, dst, dst_reg->off, 1, false)) {
+		verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n",
+			dst);
+		return -EACCES;
+	}
+
 	return 0;
 }
 
-- 
cgit 


From e4298d25830a866cc0f427d4bccb858e76715859 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:31 +0100
Subject: bpf: restrict stack pointer arithmetic for unprivileged

Restrict stack pointer arithmetic for unprivileged users in that
arithmetic itself must not go out of bounds as opposed to the actual
access later on. Therefore after each adjust_ptr_min_max_vals() with
a stack pointer as a destination we simulate a check_stack_access()
of 1 byte on the destination and once that fails the program is
rejected for unprivileged program loads. This is analog to map
value pointer arithmetic and needed for masking later on.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 63 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4da8c73e168f..9ac205d1b8b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1387,6 +1387,31 @@ static int check_stack_read(struct bpf_verifier_env *env,
 	}
 }
 
+static int check_stack_access(struct bpf_verifier_env *env,
+			      const struct bpf_reg_state *reg,
+			      int off, int size)
+{
+	/* Stack accesses must be at a fixed offset, so that we
+	 * can determine what type of data were returned. See
+	 * check_stack_read().
+	 */
+	if (!tnum_is_const(reg->var_off)) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "variable stack access var_off=%s off=%d size=%d",
+			tn_buf, off, size);
+		return -EACCES;
+	}
+
+	if (off >= 0 || off < -MAX_BPF_STACK) {
+		verbose(env, "invalid stack off=%d size=%d\n", off, size);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 /* check read/write into map element returned by bpf_map_lookup_elem() */
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 			      int size, bool zero_size_allowed)
@@ -1954,24 +1979,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
-		/* stack accesses must be at a fixed offset, so that we can
-		 * determine what type of data were returned.
-		 * See check_stack_read().
-		 */
-		if (!tnum_is_const(reg->var_off)) {
-			char tn_buf[48];
-
-			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "variable stack access var_off=%s off=%d size=%d",
-				tn_buf, off, size);
-			return -EACCES;
-		}
 		off += reg->var_off.value;
-		if (off >= 0 || off < -MAX_BPF_STACK) {
-			verbose(env, "invalid stack off=%d size=%d\n", off,
-				size);
-			return -EACCES;
-		}
+		err = check_stack_access(env, reg, off, size);
+		if (err)
+			return err;
 
 		state = func(env, reg);
 		err = update_stack_depth(env, state, off);
@@ -3253,11 +3264,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	/* For unprivileged we require that resulting offset must be in bounds
 	 * in order to be able to sanitize access later on.
 	 */
-	if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE &&
-	    check_map_access(env, dst, dst_reg->off, 1, false)) {
-		verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n",
-			dst);
-		return -EACCES;
+	if (!env->allow_ptr_leaks) {
+		if (dst_reg->type == PTR_TO_MAP_VALUE &&
+		    check_map_access(env, dst, dst_reg->off, 1, false)) {
+			verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+				"prohibited for !root\n", dst);
+			return -EACCES;
+		} else if (dst_reg->type == PTR_TO_STACK &&
+			   check_stack_access(env, dst_reg, dst_reg->off +
+					      dst_reg->var_off.value, 1)) {
+			verbose(env, "R%d stack pointer arithmetic goes out of range, "
+				"prohibited for !root\n", dst);
+			return -EACCES;
+		}
 	}
 
 	return 0;
-- 
cgit 


From 9d7eceede769f90b66cfa06ad5b357140d5141ed Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:32 +0100
Subject: bpf: restrict unknown scalars of mixed signed bounds for unprivileged

For unknown scalars of mixed signed bounds, meaning their smin_value is
negative and their smax_value is positive, we need to reject arithmetic
with pointer to map value. For unprivileged the goal is to mask every
map pointer arithmetic and this cannot reliably be done when it is
unknown at verification time whether the scalar value is negative or
positive. Given this is a corner case, the likelihood of breaking should
be very small.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9ac205d1b8b7..eebbc03e5af2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3081,8 +3081,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
 	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
 	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+	u32 dst = insn->dst_reg, src = insn->src_reg;
 	u8 opcode = BPF_OP(insn->code);
-	u32 dst = insn->dst_reg;
 
 	dst_reg = &regs[dst];
 
@@ -3115,6 +3115,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
+	case PTR_TO_MAP_VALUE:
+		if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
+			verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
+				off_reg == dst_reg ? dst : src);
+			return -EACCES;
+		}
+		/* fall-through */
 	default:
 		break;
 	}
-- 
cgit 


From b7137c4eab85c1cf3d46acdde90ce1163b28c873 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:33 +0100
Subject: bpf: fix check_map_access smin_value test when pointer contains
 offset

In check_map_access() we probe actual bounds through __check_map_access()
with offset of reg->smin_value + off for lower bound and offset of
reg->umax_value + off for the upper bound. However, even though the
reg->smin_value could have a negative value, the final result of the
sum with off could be positive when pointer arithmetic with known and
unknown scalars is combined. In this case we reject the program with
an error such as "R<x> min value is negative, either use unsigned index
or do a if (index >=0) check." even though the access itself would be
fine. Therefore extend the check to probe whether the actual resulting
reg->smin_value + off is less than zero.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eebbc03e5af2..8e5da1ce5da4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1443,13 +1443,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 */
 	if (env->log.level)
 		print_verifier_state(env, state);
+
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
 	 * value is 0.  If we are using signed variables for our
 	 * index'es we need to make sure that whatever we use
 	 * will have a set floor within our range.
 	 */
-	if (reg->smin_value < 0) {
+	if (reg->smin_value < 0 &&
+	    (reg->smin_value == S64_MIN ||
+	     (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
+	      reg->smin_value + off < 0)) {
 		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
-- 
cgit 


From 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Jan 2019 00:58:34 +0100
Subject: bpf: prevent out of bounds speculation on pointer arithmetic

Jann reported that the original commit back in b2157399cc98
("bpf: prevent out-of-bounds speculation") was not sufficient
to stop CPU from speculating out of bounds memory access:
While b2157399cc98 only focussed on masking array map access
for unprivileged users for tail calls and data access such
that the user provided index gets sanitized from BPF program
and syscall side, there is still a more generic form affected
from BPF programs that applies to most maps that hold user
data in relation to dynamic map access when dealing with
unknown scalars or "slow" known scalars as access offset, for
example:

  - Load a map value pointer into R6
  - Load an index into R7
  - Do a slow computation (e.g. with a memory dependency) that
    loads a limit into R8 (e.g. load the limit from a map for
    high latency, then mask it to make the verifier happy)
  - Exit if R7 >= R8 (mispredicted branch)
  - Load R0 = R6[R7]
  - Load R0 = R6[R0]

For unknown scalars there are two options in the BPF verifier
where we could derive knowledge from in order to guarantee
safe access to the memory: i) While </>/<=/>= variants won't
allow to derive any lower or upper bounds from the unknown
scalar where it would be safe to add it to the map value
pointer, it is possible through ==/!= test however. ii) another
option is to transform the unknown scalar into a known scalar,
for example, through ALU ops combination such as R &= <imm>
followed by R |= <imm> or any similar combination where the
original information from the unknown scalar would be destroyed
entirely leaving R with a constant. The initial slow load still
precedes the latter ALU ops on that register, so the CPU
executes speculatively from that point. Once we have the known
scalar, any compare operation would work then. A third option
only involving registers with known scalars could be crafted
as described in [0] where a CPU port (e.g. Slow Int unit)
would be filled with many dependent computations such that
the subsequent condition depending on its outcome has to wait
for evaluation on its execution port and thereby executing
speculatively if the speculated code can be scheduled on a
different execution port, or any other form of mistraining
as described in [1], for example. Given this is not limited
to only unknown scalars, not only map but also stack access
is affected since both is accessible for unprivileged users
and could potentially be used for out of bounds access under
speculation.

In order to prevent any of these cases, the verifier is now
sanitizing pointer arithmetic on the offset such that any
out of bounds speculation would be masked in a way where the
pointer arithmetic result in the destination register will
stay unchanged, meaning offset masked into zero similar as
in array_index_nospec() case. With regards to implementation,
there are three options that were considered: i) new insn
for sanitation, ii) push/pop insn and sanitation as inlined
BPF, iii) reuse of ax register and sanitation as inlined BPF.

Option i) has the downside that we end up using from reserved
bits in the opcode space, but also that we would require
each JIT to emit masking as native arch opcodes meaning
mitigation would have slow adoption till everyone implements
it eventually which is counter-productive. Option ii) and iii)
have both in common that a temporary register is needed in
order to implement the sanitation as inlined BPF since we
are not allowed to modify the source register. While a push /
pop insn in ii) would be useful to have in any case, it
requires once again that every JIT needs to implement it
first. While possible, amount of changes needed would also
be unsuitable for a -stable patch. Therefore, the path which
has fewer changes, less BPF instructions for the mitigation
and does not require anything to be changed in the JITs is
option iii) which this work is pursuing. The ax register is
already mapped to a register in all JITs (modulo arm32 where
it's mapped to stack as various other BPF registers there)
and used in constant blinding for JITs-only so far. It can
be reused for verifier rewrites under certain constraints.
The interpreter's tmp "register" has therefore been remapped
into extending the register set with hidden ax register and
reusing that for a number of instructions that needed the
prior temporary variable internally (e.g. div, mod). This
allows for zero increase in stack space usage in the interpreter,
and enables (restricted) generic use in rewrites otherwise as
long as such a patchlet does not make use of these instructions.
The sanitation mask is dynamic and relative to the offset the
map value or stack pointer currently holds.

There are various cases that need to be taken under consideration
for the masking, e.g. such operation could look as follows:
ptr += val or val += ptr or ptr -= val. Thus, the value to be
sanitized could reside either in source or in destination
register, and the limit is different depending on whether
the ALU op is addition or subtraction and depending on the
current known and bounded offset. The limit is derived as
follows: limit := max_value_size - (smin_value + off). For
subtraction: limit := umax_value + off. This holds because
we do not allow any pointer arithmetic that would
temporarily go out of bounds or would have an unknown
value with mixed signed bounds where it is unclear at
verification time whether the actual runtime value would
be either negative or positive. For example, we have a
derived map pointer value with constant offset and bounded
one, so limit based on smin_value works because the verifier
requires that statically analyzed arithmetic on the pointer
must be in bounds, and thus it checks if resulting
smin_value + off and umax_value + off is still within map
value bounds at time of arithmetic in addition to time of
access. Similarly, for the case of stack access we derive
the limit as follows: MAX_BPF_STACK + off for subtraction
and -off for the case of addition where off := ptr_reg->off +
ptr_reg->var_off.value. Subtraction is a special case for
the masking which can be in form of ptr += -val, ptr -= -val,
or ptr -= val. In the first two cases where we know that
the value is negative, we need to temporarily negate the
value in order to do the sanitation on a positive value
where we later swap the ALU op, and restore original source
register if the value was in source.

The sanitation of pointer arithmetic alone is still not fully
sufficient as is, since a scenario like the following could
happen ...

  PTR += 0x1000 (e.g. K-based imm)
  PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON
  PTR += 0x1000
  PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON
  [...]

... which under speculation could end up as ...

  PTR += 0x1000
  PTR -= 0 [ truncated by mitigation ]
  PTR += 0x1000
  PTR -= 0 [ truncated by mitigation ]
  [...]

... and therefore still access out of bounds. To prevent such
case, the verifier is also analyzing safety for potential out
of bounds access under speculative execution. Meaning, it is
also simulating pointer access under truncation. We therefore
"branch off" and push the current verification state after the
ALU operation with known 0 to the verification stack for later
analysis. Given the current path analysis succeeded it is
likely that the one under speculation can be pruned. In any
case, it is also subject to existing complexity limits and
therefore anything beyond this point will be rejected. In
terms of pruning, it needs to be ensured that the verification
state from speculative execution simulation must never prune
a non-speculative execution path, therefore, we mark verifier
state accordingly at the time of push_stack(). If verifier
detects out of bounds access under speculative execution from
one of the possible paths that includes a truncation, it will
reject such program.

Given we mask every reg-based pointer arithmetic for
unprivileged programs, we've been looking into how it could
affect real-world programs in terms of size increase. As the
majority of programs are targeted for privileged-only use
case, we've unconditionally enabled masking (with its alu
restrictions on top of it) for privileged programs for the
sake of testing in order to check i) whether they get rejected
in its current form, and ii) by how much the number of
instructions and size will increase. We've tested this by
using Katran, Cilium and test_l4lb from the kernel selftests.
For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o
and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb
we've used test_l4lb.o as well as test_l4lb_noinline.o. We
found that none of the programs got rejected by the verifier
with this change, and that impact is rather minimal to none.
balancer_kern.o had 13,904 bytes (1,738 insns) xlated and
7,797 bytes JITed before and after the change. Most complex
program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated
and 18,538 bytes JITed before and after and none of the other
tail call programs in bpf_lxc.o had any changes either. For
the older bpf_lxc_opt_-DUNKNOWN.o object we found a small
increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed
before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed
after the change. Other programs from that object file had
similar small increase. Both test_l4lb.o had no change and
remained at 6,544 bytes (817 insns) xlated and 3,401 bytes
JITed and for test_l4lb_noinline.o constant at 5,080 bytes
(634 insns) xlated and 3,313 bytes JITed. This can be explained
in that LLVM typically optimizes stack based pointer arithmetic
by using K-based operations and that use of dynamic map access
is not overly frequent. However, in future we may decide to
optimize the algorithm further under known guarantees from
branch and value speculation. Latter seems also unclear in
terms of prediction heuristics that today's CPUs apply as well
as whether there could be collisions in e.g. the predictor's
Value History/Pattern Table for triggering out of bounds access,
thus masking is performed unconditionally at this point but could
be subject to relaxation later on. We were generally also
brainstorming various other approaches for mitigation, but the
blocker was always lack of available registers at runtime and/or
overhead for runtime tracking of limits belonging to a specific
pointer. Thus, we found this to be minimally intrusive under
given constraints.

With that in place, a simple example with sanitized access on
unprivileged load at post-verification time looks as follows:

  # bpftool prog dump xlated id 282
  [...]
  28: (79) r1 = *(u64 *)(r7 +0)
  29: (79) r2 = *(u64 *)(r7 +8)
  30: (57) r1 &= 15
  31: (79) r3 = *(u64 *)(r0 +4608)
  32: (57) r3 &= 1
  33: (47) r3 |= 1
  34: (2d) if r2 > r3 goto pc+19
  35: (b4) (u32) r11 = (u32) 20479  |
  36: (1f) r11 -= r2                | Dynamic sanitation for pointer
  37: (4f) r11 |= r2                | arithmetic with registers
  38: (87) r11 = -r11               | containing bounded or known
  39: (c7) r11 s>>= 63              | scalars in order to prevent
  40: (5f) r11 &= r2                | out of bounds speculation.
  41: (0f) r4 += r11                |
  42: (71) r4 = *(u8 *)(r4 +0)
  43: (6f) r4 <<= r1
  [...]

For the case where the scalar sits in the destination register
as opposed to the source register, the following code is emitted
for the above example:

  [...]
  16: (b4) (u32) r11 = (u32) 20479
  17: (1f) r11 -= r2
  18: (4f) r11 |= r2
  19: (87) r11 = -r11
  20: (c7) r11 s>>= 63
  21: (5f) r2 &= r11
  22: (0f) r2 += r0
  23: (61) r0 = *(u32 *)(r2 +0)
  [...]

JIT blinding example with non-conflicting use of r10:

  [...]
   d5:	je     0x0000000000000106    _
   d7:	mov    0x0(%rax),%edi       |
   da:	mov    $0xf153246,%r10d     | Index load from map value and
   e0:	xor    $0xf153259,%r10      | (const blinded) mask with 0x1f.
   e7:	and    %r10,%rdi            |_
   ea:	mov    $0x2f,%r10d          |
   f0:	sub    %rdi,%r10            | Sanitized addition. Both use r10
   f3:	or     %rdi,%r10            | but do not interfere with each
   f6:	neg    %r10                 | other. (Neither do these instructions
   f9:	sar    $0x3f,%r10           | interfere with the use of ax as temp
   fd:	and    %r10,%rdi            | in interpreter.)
  100:	add    %rax,%rdi            |_
  103:	mov    0x0(%rdi),%eax
 [...]

Tested that it fixes Jann's reproducer, and also checked that test_verifier
and test_progs suite with interpreter, JIT and JIT with hardening enabled
on x86-64 and arm64 runs successfully.

  [0] Speculose: Analyzing the Security Implications of Speculative
      Execution in CPUs, Giorgi Maisuradze and Christian Rossow,
      https://arxiv.org/pdf/1801.04084.pdf

  [1] A Systematic Evaluation of Transient Execution Attacks and
      Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz,
      Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens,
      Dmitry Evtyushkin, Daniel Gruss,
      https://arxiv.org/pdf/1811.05441.pdf

Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  10 +++
 kernel/bpf/verifier.c        | 185 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 189 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3f84f3e87704..27b74947cd2b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	bool speculative;
 };
 
 #define bpf_get_spilled_reg(slot, frame)				\
@@ -167,15 +168,24 @@ struct bpf_verifier_state_list {
 	struct bpf_verifier_state_list *next;
 };
 
+/* Possible states for alu_state member. */
+#define BPF_ALU_SANITIZE_SRC		1U
+#define BPF_ALU_SANITIZE_DST		2U
+#define BPF_ALU_NEG_VALUE		(1U << 2)
+#define BPF_ALU_SANITIZE		(BPF_ALU_SANITIZE_SRC | \
+					 BPF_ALU_SANITIZE_DST)
+
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
 		unsigned long map_state;	/* pointer/poison value for maps */
 		s32 call_imm;			/* saved imm field of call insn */
+		u32 alu_limit;			/* limit for add/sub register with pointer */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	u8 alu_state; /* used in combination with alu_limit */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e5da1ce5da4..f6bc62a9ee8e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		free_func_state(dst_state->frame[i]);
 		dst_state->frame[i] = NULL;
 	}
+	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
@@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 }
 
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
-					     int insn_idx, int prev_insn_idx)
+					     int insn_idx, int prev_insn_idx,
+					     bool speculative)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem;
@@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	err = copy_verifier_state(&elem->st, cur);
 	if (err)
 		goto err;
+	elem->st.speculative |= speculative;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose(env, "BPF program is too complex\n");
 		goto err;
@@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+			      u32 *ptr_limit, u8 opcode, bool off_is_neg)
+{
+	bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
+			    (opcode == BPF_SUB && !off_is_neg);
+	u32 off;
+
+	switch (ptr_reg->type) {
+	case PTR_TO_STACK:
+		off = ptr_reg->off + ptr_reg->var_off.value;
+		if (mask_to_left)
+			*ptr_limit = MAX_BPF_STACK + off;
+		else
+			*ptr_limit = -off;
+		return 0;
+	case PTR_TO_MAP_VALUE:
+		if (mask_to_left) {
+			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+		} else {
+			off = ptr_reg->smin_value + ptr_reg->off;
+			*ptr_limit = ptr_reg->map_ptr->value_size - off;
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn,
+			    const struct bpf_reg_state *ptr_reg,
+			    struct bpf_reg_state *dst_reg,
+			    bool off_is_neg)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_insn_aux_data *aux = cur_aux(env);
+	bool ptr_is_dst_reg = ptr_reg == dst_reg;
+	u8 opcode = BPF_OP(insn->code);
+	u32 alu_state, alu_limit;
+	struct bpf_reg_state tmp;
+	bool ret;
+
+	if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K)
+		return 0;
+
+	/* We already marked aux for masking from non-speculative
+	 * paths, thus we got here in the first place. We only care
+	 * to explore bad access from here.
+	 */
+	if (vstate->speculative)
+		goto do_sim;
+
+	alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+	alu_state |= ptr_is_dst_reg ?
+		     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+	if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
+		return 0;
+
+	/* If we arrived here from different branches with different
+	 * limits to sanitize, then this won't work.
+	 */
+	if (aux->alu_state &&
+	    (aux->alu_state != alu_state ||
+	     aux->alu_limit != alu_limit))
+		return -EACCES;
+
+	/* Corresponding fixup done in fixup_bpf_calls(). */
+	aux->alu_state = alu_state;
+	aux->alu_limit = alu_limit;
+
+do_sim:
+	/* Simulate and find potential out-of-bounds access under
+	 * speculative execution from truncation as a result of
+	 * masking when off was not within expected range. If off
+	 * sits in dst, then we temporarily need to move ptr there
+	 * to simulate dst (== 0) +/-= ptr. Needed, for example,
+	 * for cases where we use K-based arithmetic in one direction
+	 * and truncated reg-based in the other in order to explore
+	 * bad access.
+	 */
+	if (!ptr_is_dst_reg) {
+		tmp = *dst_reg;
+		*dst_reg = *ptr_reg;
+	}
+	ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+	if (!ptr_is_dst_reg)
+		*dst_reg = tmp;
+	return !ret ? -EFAULT : 0;
+}
+
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
  * Caller should also handle BPF_MOV case separately.
  * If we return -EACCES, caller may want to try again treating pointer as a
@@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
 	u32 dst = insn->dst_reg, src = insn->src_reg;
 	u8 opcode = BPF_OP(insn->code);
+	int ret;
 
 	dst_reg = &regs[dst];
 
@@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	switch (opcode) {
 	case BPF_ADD:
+		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+		if (ret < 0) {
+			verbose(env, "R%d tried to add from different maps or paths\n", dst);
+			return ret;
+		}
 		/* We can take a fixed offset as long as it doesn't overflow
 		 * the s32 'off' field
 		 */
@@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		break;
 	case BPF_SUB:
+		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+		if (ret < 0) {
+			verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+			return ret;
+		}
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
 			verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		}
 	}
 
-	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
+				  false);
 	if (!other_branch)
 		return -EFAULT;
 	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
@@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->curframe != cur->curframe)
 		return false;
 
+	/* Verification state from speculative execution simulation
+	 * must never prune a non-speculative execution one.
+	 */
+	if (old->speculative && !cur->speculative)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env)
 	if (!state)
 		return -ENOMEM;
 	state->curframe = 0;
+	state->speculative = false;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);
@@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env)
 			/* found equivalent state, can prune the search */
 			if (env->log.level) {
 				if (do_print_state)
-					verbose(env, "\nfrom %d to %d: safe\n",
-						env->prev_insn_idx, env->insn_idx);
+					verbose(env, "\nfrom %d to %d%s: safe\n",
+						env->prev_insn_idx, env->insn_idx,
+						env->cur_state->speculative ?
+						" (speculative execution)" : "");
 				else
 					verbose(env, "%d: safe\n", env->insn_idx);
 			}
@@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env)
 			if (env->log.level > 1)
 				verbose(env, "%d:", env->insn_idx);
 			else
-				verbose(env, "\nfrom %d to %d:",
-					env->prev_insn_idx, env->insn_idx);
+				verbose(env, "\nfrom %d to %d%s:",
+					env->prev_insn_idx, env->insn_idx,
+					env->cur_state->speculative ?
+					" (speculative execution)" : "");
 			print_verifier_state(env, state->frame[state->curframe]);
 			do_print_state = false;
 		}
@@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+		    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+			const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+			const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+			struct bpf_insn insn_buf[16];
+			struct bpf_insn *patch = &insn_buf[0];
+			bool issrc, isneg;
+			u32 off_reg;
+
+			aux = &env->insn_aux_data[i + delta];
+			if (!aux->alu_state)
+				continue;
+
+			isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+			issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+				BPF_ALU_SANITIZE_SRC;
+
+			off_reg = issrc ? insn->src_reg : insn->dst_reg;
+			if (isneg)
+				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+			*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+			*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+			*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+			*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+			*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+			if (issrc) {
+				*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
+							 off_reg);
+				insn->src_reg = BPF_REG_AX;
+			} else {
+				*patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
+							 BPF_REG_AX);
+			}
+			if (isneg)
+				insn->code = insn->code == code_add ?
+					     code_sub : code_add;
+			*patch++ = *insn;
+			if (issrc && isneg)
+				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+			cnt = patch - insn_buf;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
-- 
cgit 


From 96d4f267e40f9509e8a66e2b39e8b95655617693 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 3 Jan 2019 18:57:57 -0800
Subject: Remove 'type' argument from access_ok() function

Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.

It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access.  But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.

A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model.  And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.

This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.

There were a couple of notable cases:

 - csky still had the old "verify_area()" name as an alias.

 - the iter_iov code had magical hardcoded knowledge of the actual
   values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
   really used it)

 - microblaze used the type argument for a debug printout

but other than those oddities this should be a total no-op patch.

I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something.  Any missed conversion should be trivially fixable, though.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/futex.h                  |  2 +-
 arch/alpha/include/asm/uaccess.h                |  2 +-
 arch/alpha/kernel/signal.c                      | 12 +--
 arch/alpha/lib/csum_partial_copy.c              |  2 +-
 arch/arc/include/asm/futex.h                    |  2 +-
 arch/arc/kernel/process.c                       |  2 +-
 arch/arc/kernel/signal.c                        |  4 +-
 arch/arm/include/asm/futex.h                    |  4 +-
 arch/arm/include/asm/uaccess.h                  |  4 +-
 arch/arm/kernel/perf_callchain.c                |  2 +-
 arch/arm/kernel/signal.c                        |  6 +-
 arch/arm/kernel/swp_emulate.c                   |  2 +-
 arch/arm/kernel/sys_oabi-compat.c               |  4 +-
 arch/arm/kernel/traps.c                         |  2 +-
 arch/arm/oprofile/common.c                      |  2 +-
 arch/arm64/include/asm/futex.h                  |  2 +-
 arch/arm64/include/asm/uaccess.h                |  8 +-
 arch/arm64/kernel/armv8_deprecated.c            |  2 +-
 arch/arm64/kernel/perf_callchain.c              |  4 +-
 arch/arm64/kernel/signal.c                      |  6 +-
 arch/arm64/kernel/signal32.c                    |  6 +-
 arch/arm64/kernel/sys_compat.c                  |  2 +-
 arch/c6x/kernel/signal.c                        |  4 +-
 arch/csky/abiv1/alignment.c                     |  4 +-
 arch/csky/include/asm/uaccess.h                 | 16 +---
 arch/csky/kernel/signal.c                       |  2 +-
 arch/csky/lib/usercopy.c                        |  8 +-
 arch/h8300/kernel/signal.c                      |  4 +-
 arch/hexagon/include/asm/futex.h                |  2 +-
 arch/hexagon/include/asm/uaccess.h              |  3 -
 arch/hexagon/kernel/signal.c                    |  4 +-
 arch/hexagon/mm/uaccess.c                       |  2 +-
 arch/ia64/include/asm/futex.h                   |  2 +-
 arch/ia64/include/asm/uaccess.h                 |  2 +-
 arch/ia64/kernel/ptrace.c                       |  4 +-
 arch/ia64/kernel/signal.c                       |  4 +-
 arch/m68k/include/asm/uaccess_mm.h              |  2 +-
 arch/m68k/include/asm/uaccess_no.h              |  2 +-
 arch/m68k/kernel/signal.c                       |  4 +-
 arch/microblaze/include/asm/futex.h             |  2 +-
 arch/microblaze/include/asm/uaccess.h           | 23 +++---
 arch/microblaze/kernel/signal.c                 |  4 +-
 arch/mips/include/asm/checksum.h                |  4 +-
 arch/mips/include/asm/futex.h                   |  2 +-
 arch/mips/include/asm/termios.h                 |  4 +-
 arch/mips/include/asm/uaccess.h                 | 12 +--
 arch/mips/kernel/mips-r2-to-r6-emul.c           | 24 +++---
 arch/mips/kernel/ptrace.c                       | 12 +--
 arch/mips/kernel/signal.c                       | 12 +--
 arch/mips/kernel/signal32.c                     |  4 +-
 arch/mips/kernel/signal_n32.c                   |  4 +-
 arch/mips/kernel/signal_o32.c                   |  8 +-
 arch/mips/kernel/syscall.c                      |  2 +-
 arch/mips/kernel/unaligned.c                    | 98 ++++++++++++-------------
 arch/mips/math-emu/cp1emu.c                     | 16 ++--
 arch/mips/mm/cache.c                            |  2 +-
 arch/mips/mm/gup.c                              |  3 +-
 arch/mips/oprofile/backtrace.c                  |  2 +-
 arch/mips/sibyte/common/sb_tbprof.c             |  2 +-
 arch/nds32/include/asm/futex.h                  |  2 +-
 arch/nds32/include/asm/uaccess.h                | 11 +--
 arch/nds32/kernel/perf_event_cpu.c              | 11 ++-
 arch/nds32/kernel/signal.c                      |  4 +-
 arch/nds32/mm/alignment.c                       |  8 +-
 arch/nios2/include/asm/uaccess.h                |  8 +-
 arch/nios2/kernel/signal.c                      |  2 +-
 arch/openrisc/include/asm/futex.h               |  2 +-
 arch/openrisc/include/asm/uaccess.h             |  8 +-
 arch/openrisc/kernel/signal.c                   |  6 +-
 arch/parisc/include/asm/futex.h                 |  2 +-
 arch/parisc/include/asm/uaccess.h               |  2 +-
 arch/powerpc/include/asm/futex.h                |  2 +-
 arch/powerpc/include/asm/uaccess.h              |  8 +-
 arch/powerpc/kernel/align.c                     |  3 +-
 arch/powerpc/kernel/rtas_flash.c                |  2 +-
 arch/powerpc/kernel/rtasd.c                     |  2 +-
 arch/powerpc/kernel/signal.c                    |  2 +-
 arch/powerpc/kernel/signal_32.c                 | 12 +--
 arch/powerpc/kernel/signal_64.c                 | 13 ++--
 arch/powerpc/kernel/syscalls.c                  |  2 +-
 arch/powerpc/kernel/traps.c                     |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c             |  4 +-
 arch/powerpc/lib/checksum_wrappers.c            |  4 +-
 arch/powerpc/mm/fault.c                         |  2 +-
 arch/powerpc/mm/subpage-prot.c                  |  2 +-
 arch/powerpc/oprofile/backtrace.c               |  4 +-
 arch/powerpc/platforms/cell/spufs/file.c        | 16 ++--
 arch/powerpc/platforms/powernv/opal-lpc.c       |  4 +-
 arch/powerpc/platforms/pseries/scanlog.c        |  2 +-
 arch/riscv/include/asm/futex.h                  |  2 +-
 arch/riscv/include/asm/uaccess.h                | 14 +---
 arch/riscv/kernel/signal.c                      |  4 +-
 arch/s390/include/asm/uaccess.h                 |  2 +-
 arch/sh/include/asm/checksum_32.h               |  2 +-
 arch/sh/include/asm/futex.h                     |  2 +-
 arch/sh/include/asm/uaccess.h                   |  9 +--
 arch/sh/kernel/signal_32.c                      |  8 +-
 arch/sh/kernel/signal_64.c                      |  8 +-
 arch/sh/kernel/traps_64.c                       | 12 +--
 arch/sh/mm/gup.c                                |  3 +-
 arch/sh/oprofile/backtrace.c                    |  2 +-
 arch/sparc/include/asm/checksum_32.h            |  2 +-
 arch/sparc/include/asm/uaccess_32.h             |  2 +-
 arch/sparc/include/asm/uaccess_64.h             |  2 +-
 arch/sparc/kernel/sigutil_32.c                  |  2 +-
 arch/sparc/kernel/unaligned_32.c                |  7 +-
 arch/um/kernel/ptrace.c                         |  4 +-
 arch/unicore32/kernel/signal.c                  |  4 +-
 arch/x86/entry/vsyscall/vsyscall_64.c           |  2 +-
 arch/x86/ia32/ia32_aout.c                       |  4 +-
 arch/x86/ia32/ia32_signal.c                     |  8 +-
 arch/x86/ia32/sys_ia32.c                        |  2 +-
 arch/x86/include/asm/checksum_32.h              |  2 +-
 arch/x86/include/asm/pgtable_32.h               |  2 +-
 arch/x86/include/asm/uaccess.h                  |  7 +-
 arch/x86/kernel/fpu/signal.c                    |  4 +-
 arch/x86/kernel/signal.c                        | 14 ++--
 arch/x86/kernel/stacktrace.c                    |  2 +-
 arch/x86/kernel/vm86_32.c                       |  4 +-
 arch/x86/lib/csum-wrappers_64.c                 |  4 +-
 arch/x86/lib/usercopy_32.c                      |  2 +-
 arch/x86/lib/usercopy_64.c                      |  2 +-
 arch/x86/math-emu/fpu_system.h                  |  4 +-
 arch/x86/math-emu/load_store.c                  |  6 +-
 arch/x86/math-emu/reg_ld_str.c                  | 48 ++++++------
 arch/x86/mm/mpx.c                               |  2 +-
 arch/x86/um/asm/checksum_32.h                   |  2 +-
 arch/x86/um/signal.c                            |  6 +-
 arch/xtensa/include/asm/checksum.h              |  2 +-
 arch/xtensa/include/asm/futex.h                 |  2 +-
 arch/xtensa/include/asm/uaccess.h               | 10 +--
 arch/xtensa/kernel/signal.c                     |  4 +-
 arch/xtensa/kernel/stacktrace.c                 |  2 +-
 drivers/acpi/acpi_dbg.c                         |  4 +-
 drivers/char/generic_nvram.c                    |  4 +-
 drivers/char/mem.c                              |  4 +-
 drivers/char/nwflash.c                          |  2 +-
 drivers/char/pcmcia/cm4000_cs.c                 |  4 +-
 drivers/crypto/ccp/psp-dev.c                    |  6 +-
 drivers/firewire/core-cdev.c                    |  2 +-
 drivers/firmware/efi/test/efi_test.c            |  8 +-
 drivers/fpga/dfl-afu-dma-region.c               |  2 +-
 drivers/fpga/dfl-fme-pr.c                       |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c        | 18 ++---
 drivers/gpu/drm/armada/armada_gem.c             |  2 +-
 drivers/gpu/drm/drm_file.c                      |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_drv.c           |  8 +-
 drivers/gpu/drm/i915/i915_gem.c                 |  7 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c      |  6 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c         |  3 +-
 drivers/gpu/drm/i915/i915_ioc32.c               |  2 +-
 drivers/gpu/drm/i915/i915_perf.c                |  2 +-
 drivers/gpu/drm/i915/i915_query.c               |  2 +-
 drivers/gpu/drm/msm/msm_gem_submit.c            |  2 +-
 drivers/gpu/drm/qxl/qxl_ioctl.c                 |  3 +-
 drivers/infiniband/core/uverbs_main.c           |  3 +-
 drivers/infiniband/hw/hfi1/user_exp_rcv.c       |  2 +-
 drivers/infiniband/hw/qib/qib_file_ops.c        |  2 +-
 drivers/macintosh/ans-lcd.c                     |  2 +-
 drivers/macintosh/via-pmu.c                     |  2 +-
 drivers/media/pci/ivtv/ivtvfb.c                 |  2 +-
 drivers/media/v4l2-core/v4l2-compat-ioctl32.c   | 46 ++++++------
 drivers/misc/vmw_vmci/vmci_host.c               |  2 +-
 drivers/pci/proc.c                              |  4 +-
 drivers/platform/goldfish/goldfish_pipe.c       |  3 +-
 drivers/pnp/isapnp/proc.c                       |  2 +-
 drivers/scsi/pmcraid.c                          |  4 +-
 drivers/scsi/scsi_ioctl.c                       |  2 +-
 drivers/scsi/sg.c                               | 16 ++--
 drivers/staging/comedi/comedi_compat32.c        | 24 +++---
 drivers/tty/n_hdlc.c                            |  2 +-
 drivers/usb/core/devices.c                      |  2 +-
 drivers/usb/core/devio.c                        |  7 +-
 drivers/usb/gadget/function/f_hid.c             |  4 +-
 drivers/usb/gadget/udc/atmel_usba_udc.c         |  2 +-
 drivers/vhost/vhost.c                           | 16 ++--
 drivers/video/fbdev/amifb.c                     |  4 +-
 drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c |  2 +-
 drivers/xen/privcmd.c                           |  6 +-
 fs/binfmt_aout.c                                |  4 +-
 fs/btrfs/send.c                                 |  2 +-
 fs/eventpoll.c                                  |  2 +-
 fs/fat/dir.c                                    |  4 +-
 fs/ioctl.c                                      |  2 +-
 fs/namespace.c                                  |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c                          |  4 +-
 fs/pstore/pmsg.c                                |  2 +-
 fs/pstore/ram_core.c                            |  2 +-
 fs/read_write.c                                 | 13 ++--
 fs/readdir.c                                    | 10 +--
 fs/select.c                                     | 11 +--
 include/asm-generic/uaccess.h                   | 12 +--
 include/linux/regset.h                          |  4 +-
 include/linux/uaccess.h                         |  9 +--
 include/net/checksum.h                          |  4 +-
 kernel/bpf/syscall.c                            |  2 +-
 kernel/compat.c                                 | 16 ++--
 kernel/events/core.c                            |  2 +-
 kernel/exit.c                                   |  4 +-
 kernel/futex.c                                  | 35 +++++----
 kernel/printk/printk.c                          |  4 +-
 kernel/ptrace.c                                 |  4 +-
 kernel/rseq.c                                   |  6 +-
 kernel/sched/core.c                             |  4 +-
 kernel/signal.c                                 |  8 +-
 kernel/sys.c                                    |  2 +-
 kernel/trace/bpf_trace.c                        |  2 +-
 lib/bitmap.c                                    |  4 +-
 lib/iov_iter.c                                  |  8 +-
 lib/usercopy.c                                  |  4 +-
 mm/gup.c                                        |  6 +-
 mm/mincore.c                                    |  4 +-
 net/batman-adv/icmp_socket.c                    |  2 +-
 net/batman-adv/log.c                            |  2 +-
 net/compat.c                                    | 30 ++++----
 net/sunrpc/sysctl.c                             |  2 +-
 security/tomoyo/common.c                        |  2 +-
 sound/core/seq/seq_clientmgr.c                  |  2 +-
 sound/isa/sb/emu8000_patch.c                    |  4 +-
 tools/perf/util/include/asm/uaccess.h           |  2 +-
 virt/kvm/kvm_main.c                             |  3 +-
 221 files changed, 610 insertions(+), 679 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index ca3322536f72..bfd3c01038f8 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -68,7 +68,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0, cmp;
 	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ (
diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index 87d8c4f0307d..e69c4e13c328 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -36,7 +36,7 @@
 #define __access_ok(addr, size) \
 	((get_fs().seg & (addr | size | (addr+size))) == 0)
 
-#define access_ok(type, addr, size)			\
+#define access_ok(addr, size)				\
 ({							\
 	__chk_user_ptr(addr);				\
 	__access_ok(((unsigned long)(addr)), (size));	\
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 8c0c4ee0be6e..33e904a05881 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -65,7 +65,7 @@ SYSCALL_DEFINE3(osf_sigaction, int, sig,
 
 	if (act) {
 		old_sigset_t mask;
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+		if (!access_ok(act, sizeof(*act)) ||
 		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
 		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
 		    __get_user(mask, &act->sa_mask))
@@ -77,7 +77,7 @@ SYSCALL_DEFINE3(osf_sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		if (!access_ok(oact, sizeof(*oact)) ||
 		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
 		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
 		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
@@ -207,7 +207,7 @@ do_sigreturn(struct sigcontext __user *sc)
 	sigset_t set;
 
 	/* Verify that it's a good sigcontext before using it */
-	if (!access_ok(VERIFY_READ, sc, sizeof(*sc)))
+	if (!access_ok(sc, sizeof(*sc)))
 		goto give_sigsegv;
 	if (__get_user(set.sig[0], &sc->sc_mask))
 		goto give_sigsegv;
@@ -235,7 +235,7 @@ do_rt_sigreturn(struct rt_sigframe __user *frame)
 	sigset_t set;
 
 	/* Verify that it's a good ucontext_t before using it */
-	if (!access_ok(VERIFY_READ, &frame->uc, sizeof(frame->uc)))
+	if (!access_ok(&frame->uc, sizeof(frame->uc)))
 		goto give_sigsegv;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto give_sigsegv;
@@ -332,7 +332,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
 
 	oldsp = rdusp();
 	frame = get_sigframe(ksig, oldsp, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= setup_sigcontext(&frame->sc, regs, set->sig[0], oldsp);
@@ -377,7 +377,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
 
 	oldsp = rdusp();
 	frame = get_sigframe(ksig, oldsp, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c
index ddb9c2f376fa..e53f96e8aa6d 100644
--- a/arch/alpha/lib/csum_partial_copy.c
+++ b/arch/alpha/lib/csum_partial_copy.c
@@ -333,7 +333,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len,
 	unsigned long doff = 7 & (unsigned long) dst;
 
 	if (len) {
-		if (!access_ok(VERIFY_READ, src, len)) {
+		if (!access_ok(src, len)) {
 			if (errp) *errp = -EFAULT;
 			memset(dst, 0, len);
 			return sum;
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index eb887dd13e74..c29c3fae6854 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -126,7 +126,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval,
 	int ret = 0;
 	u32 existval;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 #ifndef CONFIG_ARC_HAS_LLSC
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 8ce6e7235915..641c364fc232 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -61,7 +61,7 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
 	/* Z indicates to userspace if operation succeded */
 	regs->status32 &= ~STATUS_Z_MASK;
 
-	ret = access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr));
+	ret = access_ok(uaddr, sizeof(*uaddr));
 	if (!ret)
 		 goto fail;
 
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index 48685445002e..1bfb7de696bd 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -169,7 +169,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 
 	sf = (struct rt_sigframe __force __user *)(regs->sp);
 
-	if (!access_ok(VERIFY_READ, sf, sizeof(*sf)))
+	if (!access_ok(sf, sizeof(*sf)))
 		goto badframe;
 
 	if (__get_user(magic, &sf->sigret_magic))
@@ -219,7 +219,7 @@ static inline void __user *get_sigframe(struct ksignal *ksig,
 	frame = (void __user *)((sp - framesize) & ~7);
 
 	/* Check that we can actually write to the signal frame */
-	if (!access_ok(VERIFY_WRITE, frame, framesize))
+	if (!access_ok(frame, framesize))
 		frame = NULL;
 
 	return frame;
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index ffebe7b7a5b7..0a46676b4245 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -50,7 +50,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret;
 	u32 val;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	smp_mb();
@@ -104,7 +104,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0;
 	u32 val;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	preempt_disable();
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index c136eef8f690..27ed17ec45fe 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -279,7 +279,7 @@ static inline void set_fs(mm_segment_t fs)
 
 #endif /* CONFIG_MMU */
 
-#define access_ok(type, addr, size)	(__range_ok(addr, size) == 0)
+#define access_ok(addr, size)	(__range_ok(addr, size) == 0)
 
 #define user_addr_max() \
 	(uaccess_kernel() ? ~0UL : get_fs())
@@ -560,7 +560,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 
 static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		n = __clear_user(to, n);
 	return n;
 }
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 08e43a32a693..3b69a76d341e 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -37,7 +37,7 @@ user_backtrace(struct frame_tail __user *tail,
 	struct frame_tail buftail;
 	unsigned long err;
 
-	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+	if (!access_ok(tail, sizeof(buftail)))
 		return NULL;
 
 	pagefault_disable();
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index b908382b69ff..76bb8de6bf6b 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -241,7 +241,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs)
 
 	frame = (struct sigframe __user *)regs->ARM_sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
 	if (restore_sigframe(regs, frame))
@@ -271,7 +271,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
 
 	frame = (struct rt_sigframe __user *)regs->ARM_sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
 	if (restore_sigframe(regs, &frame->sig))
@@ -355,7 +355,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, int framesize)
 	/*
 	 * Check that we can actually write to the signal frame.
 	 */
-	if (!access_ok(VERIFY_WRITE, frame, framesize))
+	if (!access_ok(frame, framesize))
 		frame = NULL;
 
 	return frame;
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index a188d5e8ab7f..76f6e6a9736c 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -198,7 +198,7 @@ static int swp_handler(struct pt_regs *regs, unsigned int instr)
 		 destreg, EXTRACT_REG_NUM(instr, RT2_OFFSET), data);
 
 	/* Check access in reasonable access range for both SWP and SWPB */
-	if (!access_ok(VERIFY_WRITE, (address & ~3), 4)) {
+	if (!access_ok((address & ~3), 4)) {
 		pr_debug("SWP{B} emulation: access to %p not allowed!\n",
 			 (void *)address);
 		res = -EFAULT;
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 40da0872170f..92ab36f38795 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -285,7 +285,7 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
 			maxevents > (INT_MAX/sizeof(*kbuf)) ||
 			maxevents > (INT_MAX/sizeof(*events)))
 		return -EINVAL;
-	if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents))
+	if (!access_ok(events, sizeof(*events) * maxevents))
 		return -EFAULT;
 	kbuf = kmalloc_array(maxevents, sizeof(*kbuf), GFP_KERNEL);
 	if (!kbuf)
@@ -326,7 +326,7 @@ asmlinkage long sys_oabi_semtimedop(int semid,
 
 	if (nsops < 1 || nsops > SEMOPM)
 		return -EINVAL;
-	if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops))
+	if (!access_ok(tsops, sizeof(*tsops) * nsops))
 		return -EFAULT;
 	sops = kmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
 	if (!sops)
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 2d668cff8ef4..33af097c454b 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -582,7 +582,7 @@ do_cache_op(unsigned long start, unsigned long end, int flags)
 	if (end < start || flags)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_READ, start, end - start))
+	if (!access_ok(start, end - start))
 		return -EFAULT;
 
 	return __do_cache_op(start, end);
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c
index cc649a1e46da..7cb3e0453fcd 100644
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -88,7 +88,7 @@ static struct frame_tail* user_backtrace(struct frame_tail *tail)
 	struct frame_tail buftail[2];
 
 	/* Also check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+	if (!access_ok(tail, sizeof(buftail)))
 		return NULL;
 	if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
 		return NULL;
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 07fe2479d310..cccb83ad7fa8 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -96,7 +96,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
 	u32 val, tmp;
 	u32 __user *uaddr;
 
-	if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32)))
+	if (!access_ok(_uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	uaddr = __uaccess_mask_ptr(_uaddr);
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index ed252435fd92..547d7a0c9d05 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -95,7 +95,7 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si
 	return ret;
 }
 
-#define access_ok(type, addr, size)	__range_ok(addr, size)
+#define access_ok(addr, size)	__range_ok(addr, size)
 #define user_addr_max			get_fs
 
 #define _ASM_EXTABLE(from, to)						\
@@ -301,7 +301,7 @@ do {									\
 ({									\
 	__typeof__(*(ptr)) __user *__p = (ptr);				\
 	might_fault();							\
-	if (access_ok(VERIFY_READ, __p, sizeof(*__p))) {		\
+	if (access_ok(__p, sizeof(*__p))) {				\
 		__p = uaccess_mask_ptr(__p);				\
 		__get_user_err((x), __p, (err));			\
 	} else {							\
@@ -370,7 +370,7 @@ do {									\
 ({									\
 	__typeof__(*(ptr)) __user *__p = (ptr);				\
 	might_fault();							\
-	if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) {		\
+	if (access_ok(__p, sizeof(*__p))) {				\
 		__p = uaccess_mask_ptr(__p);				\
 		__put_user_err((x), __p, (err));			\
 	} else	{							\
@@ -418,7 +418,7 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi
 extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n);
 static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		n = __arch_clear_user(__uaccess_mask_ptr(to), n);
 	return n;
 }
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 92be1d12d590..e52e7280884a 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -402,7 +402,7 @@ static int swp_handler(struct pt_regs *regs, u32 instr)
 
 	/* Check access in reasonable access range for both SWP and SWPB */
 	user_ptr = (const void __user *)(unsigned long)(address & ~3);
-	if (!access_ok(VERIFY_WRITE, user_ptr, 4)) {
+	if (!access_ok(user_ptr, 4)) {
 		pr_debug("SWP{B} emulation: access to 0x%08x not allowed!\n",
 			address);
 		goto fault;
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index a34c26afacb0..61d983f5756f 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -39,7 +39,7 @@ user_backtrace(struct frame_tail __user *tail,
 	unsigned long lr;
 
 	/* Also check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+	if (!access_ok(tail, sizeof(buftail)))
 		return NULL;
 
 	pagefault_disable();
@@ -86,7 +86,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 	unsigned long err;
 
 	/* Also check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+	if (!access_ok(tail, sizeof(buftail)))
 		return NULL;
 
 	pagefault_disable();
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 5dcc942906db..867a7cea70e5 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -470,7 +470,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			offset = 0;
 			limit = extra_size;
 
-			if (!access_ok(VERIFY_READ, base, limit))
+			if (!access_ok(base, limit))
 				goto invalid;
 
 			continue;
@@ -556,7 +556,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 
 	frame = (struct rt_sigframe __user *)regs->sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
 	if (restore_sigframe(regs, frame))
@@ -730,7 +730,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user,
 	/*
 	 * Check that we can actually write to the signal frame.
 	 */
-	if (!access_ok(VERIFY_WRITE, user->sigframe, sp_top - sp))
+	if (!access_ok(user->sigframe, sp_top - sp))
 		return -EFAULT;
 
 	return 0;
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 24b09003f821..cb7800acd19f 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -303,7 +303,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
 
 	frame = (struct compat_sigframe __user *)regs->compat_sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
 	if (compat_restore_sigframe(regs, frame))
@@ -334,7 +334,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
 
 	frame = (struct compat_rt_sigframe __user *)regs->compat_sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
 	if (compat_restore_sigframe(regs, &frame->sig))
@@ -365,7 +365,7 @@ static void __user *compat_get_sigframe(struct ksignal *ksig,
 	/*
 	 * Check that we can actually write to the signal frame.
 	 */
-	if (!access_ok(VERIFY_WRITE, frame, framesize))
+	if (!access_ok(frame, framesize))
 		frame = NULL;
 
 	return frame;
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 32653d156747..21005dfe8406 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -58,7 +58,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags)
 	if (end < start || flags)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_READ, (const void __user *)start, end - start))
+	if (!access_ok((const void __user *)start, end - start))
 		return -EFAULT;
 
 	return __do_compat_cache_op(start, end);
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index 3c4bb5a5c382..33b9f69c38f7 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -80,7 +80,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
 
 	frame = (struct rt_sigframe __user *) ((unsigned long) regs->sp + 8);
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -149,7 +149,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= __put_user(&frame->info, &frame->pinfo);
diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c
index 60205e98fb87..d789be36eb4f 100644
--- a/arch/csky/abiv1/alignment.c
+++ b/arch/csky/abiv1/alignment.c
@@ -32,7 +32,7 @@ static int ldb_asm(uint32_t addr, uint32_t *valp)
 	uint32_t val;
 	int err;
 
-	if (!access_ok(VERIFY_READ, (void *)addr, 1))
+	if (!access_ok((void *)addr, 1))
 		return 1;
 
 	asm volatile (
@@ -67,7 +67,7 @@ static int stb_asm(uint32_t addr, uint32_t val)
 {
 	int err;
 
-	if (!access_ok(VERIFY_WRITE, (void *)addr, 1))
+	if (!access_ok((void *)addr, 1))
 		return 1;
 
 	asm volatile (
diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h
index acaf0e210d81..eaa1c3403a42 100644
--- a/arch/csky/include/asm/uaccess.h
+++ b/arch/csky/include/asm/uaccess.h
@@ -16,10 +16,7 @@
 #include <linux/version.h>
 #include <asm/segment.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
-static inline int access_ok(int type, const void *addr, unsigned long size)
+static inline int access_ok(const void *addr, unsigned long size)
 {
 	unsigned long limit = current_thread_info()->addr_limit.seg;
 
@@ -27,12 +24,7 @@ static inline int access_ok(int type, const void *addr, unsigned long size)
 		((unsigned long)(addr + size) < limit));
 }
 
-static inline int verify_area(int type, const void *addr, unsigned long size)
-{
-	return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-#define __addr_ok(addr) (access_ok(VERIFY_READ, addr, 0))
+#define __addr_ok(addr) (access_ok(addr, 0))
 
 extern int __put_user_bad(void);
 
@@ -91,7 +83,7 @@ extern int __put_user_bad(void);
 	long __pu_err = -EFAULT;					\
 	typeof(*(ptr)) *__pu_addr = (ptr);				\
 	typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x);			\
-	if (access_ok(VERIFY_WRITE, __pu_addr, size) && __pu_addr)	\
+	if (access_ok(__pu_addr, size) && __pu_addr)	\
 		__put_user_size(__pu_val, __pu_addr, (size), __pu_err);	\
 	__pu_err;							\
 })
@@ -217,7 +209,7 @@ do {								\
 ({								\
 	int __gu_err = -EFAULT;					\
 	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);	\
-	if (access_ok(VERIFY_READ, __gu_ptr, size) && __gu_ptr)	\
+	if (access_ok(__gu_ptr, size) && __gu_ptr)	\
 		__get_user_size(x, __gu_ptr, size, __gu_err);	\
 	__gu_err;						\
 })
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index 66e1b729b10b..9967c10eee2b 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -88,7 +88,7 @@ do_rt_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe *frame = (struct rt_sigframe *)(regs->usp);
 
-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
diff --git a/arch/csky/lib/usercopy.c b/arch/csky/lib/usercopy.c
index ac9170e2cbb8..647a23986fb5 100644
--- a/arch/csky/lib/usercopy.c
+++ b/arch/csky/lib/usercopy.c
@@ -7,7 +7,7 @@
 unsigned long raw_copy_from_user(void *to, const void *from,
 			unsigned long n)
 {
-	if (access_ok(VERIFY_READ, from, n))
+	if (access_ok(from, n))
 		__copy_user_zeroing(to, from, n);
 	else
 		memset(to, 0, n);
@@ -18,7 +18,7 @@ EXPORT_SYMBOL(raw_copy_from_user);
 unsigned long raw_copy_to_user(void *to, const void *from,
 			unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		__copy_user(to, from, n);
 	return n;
 }
@@ -113,7 +113,7 @@ long strncpy_from_user(char *dst, const char *src, long count)
 {
 	long res = -EFAULT;
 
-	if (access_ok(VERIFY_READ, src, 1))
+	if (access_ok(src, 1))
 		__do_strncpy_from_user(dst, src, count, res);
 	return res;
 }
@@ -236,7 +236,7 @@ do {							\
 unsigned long
 clear_user(void __user *to, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		__do_clear_user(to, n);
 	return n;
 }
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 1e8070d08770..e0f2b708e5d9 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -110,7 +110,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	sigset_t set;
 	int er0;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -165,7 +165,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h
index c889f5993ecd..cb635216a732 100644
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -77,7 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	int prev;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ (
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index 458b69886b34..a30e58d5f351 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -29,9 +29,6 @@
 
 /*
  * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE.  Note that
- *        %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- *        to write to a block, it is always safe to read from it.
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 78aa7304a5c9..31e2cf95f189 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -115,7 +115,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(struct rt_sigframe)))
+	if (!access_ok(frame, sizeof(struct rt_sigframe)))
 		return -EFAULT;
 
 	if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -244,7 +244,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	current->restart_block.fn = do_no_restart_syscall;
 
 	frame = (struct rt_sigframe __user *)pt_psp(regs);
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&blocked, &frame->uc.uc_sigmask, sizeof(blocked)))
 		goto badframe;
diff --git a/arch/hexagon/mm/uaccess.c b/arch/hexagon/mm/uaccess.c
index c599eb126c9e..6f9c4697552c 100644
--- a/arch/hexagon/mm/uaccess.c
+++ b/arch/hexagon/mm/uaccess.c
@@ -51,7 +51,7 @@ __kernel_size_t __clear_user_hexagon(void __user *dest, unsigned long count)
 
 unsigned long clear_user_hexagon(void __user *dest, unsigned long count)
 {
-	if (!access_ok(VERIFY_WRITE, dest, count))
+	if (!access_ok(dest, count))
 		return count;
 	else
 		return __clear_user_hexagon(dest, count);
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index db2dd85918c2..2e106d462196 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -86,7 +86,7 @@ static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 			      u32 oldval, u32 newval)
 {
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	{
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
index a74524f2d625..306d469e43da 100644
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -67,7 +67,7 @@ static inline int __access_ok(const void __user *p, unsigned long size)
 	return likely(addr <= seg) &&
 	 (seg == KERNEL_DS.seg || likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT));
 }
-#define access_ok(type, addr, size)	__access_ok((addr), (size))
+#define access_ok(addr, size)	__access_ok((addr), (size))
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 427cd565fd61..6d50ede0ed69 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -836,7 +836,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 	char nat = 0;
 	int i;
 
-	if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs)))
+	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
 		return -EIO;
 
 	pt = task_pt_regs(child);
@@ -981,7 +981,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
 
 	memset(&fpval, 0, sizeof(fpval));
 
-	if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs)))
+	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
 		return -EIO;
 
 	pt = task_pt_regs(child);
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 99099f73b207..6062fd14e34e 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -132,7 +132,7 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 		 */
 		retval = (long) &ia64_strace_leave_kernel;
 
-	if (!access_ok(VERIFY_READ, sc, sizeof(*sc)))
+	if (!access_ok(sc, sizeof(*sc)))
 		goto give_sigsegv;
 
 	if (GET_SIGSET(&set, &sc->sc_mask))
@@ -264,7 +264,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
 	}
 	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) {
+	if (!access_ok(frame, sizeof(*frame))) {
 		force_sigsegv(ksig->sig, current);
 		return 1;
 	}
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index c4cb889660aa..7e85de984df1 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -10,7 +10,7 @@
 #include <asm/segment.h>
 
 /* We let the MMU do all checking */
-static inline int access_ok(int type, const void __user *addr,
+static inline int access_ok(const void __user *addr,
 			    unsigned long size)
 {
 	return 1;
diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h
index 892efb56beef..0134008bf539 100644
--- a/arch/m68k/include/asm/uaccess_no.h
+++ b/arch/m68k/include/asm/uaccess_no.h
@@ -10,7 +10,7 @@
 
 #include <asm/segment.h>
 
-#define access_ok(type,addr,size)	_access_ok((unsigned long)(addr),(size))
+#define access_ok(addr,size)	_access_ok((unsigned long)(addr),(size))
 
 /*
  * It is not enough to just have access_ok check for a real RAM address.
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 72850b85ecf8..e2a9421c5797 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -787,7 +787,7 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw)
 	struct sigframe __user *frame = (struct sigframe __user *)(usp - 4);
 	sigset_t set;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.sc_mask) ||
 	    (_NSIG_WORDS > 1 &&
@@ -812,7 +812,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw)
 	struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(usp - 4);
 	sigset_t set;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index 2572077b04ea..8c90357e5983 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -71,7 +71,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0, cmp;
 	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ ("1:	lwx	%1, %3, r0;		\
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 81f16aadbf9e..dbfea093a7c7 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -60,26 +60,25 @@ static inline int ___range_ok(unsigned long addr, unsigned long size)
 #define __range_ok(addr, size) \
 		___range_ok((unsigned long)(addr), (unsigned long)(size))
 
-#define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
+#define access_ok(addr, size) (__range_ok((addr), (size)) == 0)
 
 #else
 
-static inline int access_ok(int type, const void __user *addr,
-							unsigned long size)
+static inline int access_ok(const void __user *addr, unsigned long size)
 {
 	if (!size)
 		goto ok;
 
 	if ((get_fs().seg < ((unsigned long)addr)) ||
 			(get_fs().seg < ((unsigned long)addr + size - 1))) {
-		pr_devel("ACCESS fail: %s at 0x%08x (size 0x%x), seg 0x%08x\n",
-			type ? "WRITE" : "READ ", (__force u32)addr, (u32)size,
+		pr_devel("ACCESS fail at 0x%08x (size 0x%x), seg 0x%08x\n",
+			(__force u32)addr, (u32)size,
 			(u32)get_fs().seg);
 		return 0;
 	}
 ok:
-	pr_devel("ACCESS OK: %s at 0x%08x (size 0x%x), seg 0x%08x\n",
-			type ? "WRITE" : "READ ", (__force u32)addr, (u32)size,
+	pr_devel("ACCESS OK at 0x%08x (size 0x%x), seg 0x%08x\n",
+			(__force u32)addr, (u32)size,
 			(u32)get_fs().seg);
 	return 1;
 }
@@ -120,7 +119,7 @@ static inline unsigned long __must_check clear_user(void __user *to,
 							unsigned long n)
 {
 	might_fault();
-	if (unlikely(!access_ok(VERIFY_WRITE, to, n)))
+	if (unlikely(!access_ok(to, n)))
 		return n;
 
 	return __clear_user(to, n);
@@ -174,7 +173,7 @@ extern long __user_bad(void);
 	const typeof(*(ptr)) __user *__gu_addr = (ptr);			\
 	int __gu_err = 0;						\
 									\
-	if (access_ok(VERIFY_READ, __gu_addr, size)) {			\
+	if (access_ok(__gu_addr, size)) {			\
 		switch (size) {						\
 		case 1:							\
 			__get_user_asm("lbu", __gu_addr, __gu_val,	\
@@ -286,7 +285,7 @@ extern long __user_bad(void);
 	typeof(*(ptr)) __user *__pu_addr = (ptr);			\
 	int __pu_err = 0;						\
 									\
-	if (access_ok(VERIFY_WRITE, __pu_addr, size)) {			\
+	if (access_ok(__pu_addr, size)) {			\
 		switch (size) {						\
 		case 1:							\
 			__put_user_asm("sb", __pu_addr, __pu_val,	\
@@ -358,7 +357,7 @@ extern int __strncpy_user(char *to, const char __user *from, int len);
 static inline long
 strncpy_from_user(char *dst, const char __user *src, long count)
 {
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return -EFAULT;
 	return __strncpy_user(dst, src, count);
 }
@@ -372,7 +371,7 @@ extern int __strnlen_user(const char __user *sstr, int len);
 
 static inline long strnlen_user(const char __user *src, long n)
 {
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return 0;
 	return __strnlen_user(src, n);
 }
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 97001524ca2d..0685696349bb 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -91,7 +91,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -166,7 +166,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h
index e8161e4dfde7..dcebaaf8c862 100644
--- a/arch/mips/include/asm/checksum.h
+++ b/arch/mips/include/asm/checksum.h
@@ -63,7 +63,7 @@ static inline
 __wsum csum_and_copy_from_user(const void __user *src, void *dst,
 			       int len, __wsum sum, int *err_ptr)
 {
-	if (access_ok(VERIFY_READ, src, len))
+	if (access_ok(src, len))
 		return csum_partial_copy_from_user(src, dst, len, sum,
 						   err_ptr);
 	if (len)
@@ -81,7 +81,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
 			     __wsum sum, int *err_ptr)
 {
 	might_fault();
-	if (access_ok(VERIFY_WRITE, dst, len)) {
+	if (access_ok(dst, len)) {
 		if (uaccess_kernel())
 			return __csum_partial_copy_kernel(src,
 							  (__force void *)dst,
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 8eff134b3a43..c14d798f3888 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -129,7 +129,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0;
 	u32 val;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	if (cpu_has_llsc && R10000_LLSC_WAR) {
diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h
index ce2d72e34274..bc29eeacc55a 100644
--- a/arch/mips/include/asm/termios.h
+++ b/arch/mips/include/asm/termios.h
@@ -32,7 +32,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
 	unsigned short iflag, oflag, cflag, lflag;
 	unsigned int err;
 
-	if (!access_ok(VERIFY_READ, termio, sizeof(struct termio)))
+	if (!access_ok(termio, sizeof(struct termio)))
 		return -EFAULT;
 
 	err = __get_user(iflag, &termio->c_iflag);
@@ -61,7 +61,7 @@ static inline int kernel_termios_to_user_termio(struct termio __user *termio,
 {
 	int err;
 
-	if (!access_ok(VERIFY_WRITE, termio, sizeof(struct termio)))
+	if (!access_ok(termio, sizeof(struct termio)))
 		return -EFAULT;
 
 	err = __put_user(termios->c_iflag, &termio->c_iflag);
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 06629011a434..d43c1dc6ef15 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -109,9 +109,6 @@ static inline bool eva_kernel_access(void)
 
 /*
  * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE.  Note that
- *	  %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- *	  to write to a block, it is always safe to read from it.
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
@@ -134,7 +131,7 @@ static inline int __access_ok(const void __user *p, unsigned long size)
 	return (get_fs().seg & (addr | (addr + size) | __ua_size(size))) == 0;
 }
 
-#define access_ok(type, addr, size)					\
+#define access_ok(addr, size)					\
 	likely(__access_ok((addr), (size)))
 
 /*
@@ -304,7 +301,7 @@ do {									\
 	const __typeof__(*(ptr)) __user * __gu_ptr = (ptr);		\
 									\
 	might_fault();							\
-	if (likely(access_ok(VERIFY_READ,  __gu_ptr, size))) {		\
+	if (likely(access_ok( __gu_ptr, size))) {		\
 		if (eva_kernel_access())				\
 			__get_kernel_common((x), size, __gu_ptr);	\
 		else							\
@@ -446,7 +443,7 @@ do {									\
 	int __pu_err = -EFAULT;						\
 									\
 	might_fault();							\
-	if (likely(access_ok(VERIFY_WRITE,  __pu_addr, size))) {	\
+	if (likely(access_ok( __pu_addr, size))) {	\
 		if (eva_kernel_access())				\
 			__put_kernel_common(__pu_addr, size);		\
 		else							\
@@ -691,8 +688,7 @@ __clear_user(void __user *addr, __kernel_size_t size)
 ({									\
 	void __user * __cl_addr = (addr);				\
 	unsigned long __cl_size = (n);					\
-	if (__cl_size && access_ok(VERIFY_WRITE,			\
-					__cl_addr, __cl_size))		\
+	if (__cl_size && access_ok(__cl_addr, __cl_size))		\
 		__cl_size = __clear_user(__cl_addr, __cl_size);		\
 	__cl_size;							\
 })
diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c
index cb22a558431e..c50c89a978f1 100644
--- a/arch/mips/kernel/mips-r2-to-r6-emul.c
+++ b/arch/mips/kernel/mips-r2-to-r6-emul.c
@@ -1205,7 +1205,7 @@ fpu_emul:
 	case lwl_op:
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1278,7 +1278,7 @@ fpu_emul:
 	case lwr_op:
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1352,7 +1352,7 @@ fpu_emul:
 	case swl_op:
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1422,7 +1422,7 @@ fpu_emul:
 	case swr_op:
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1497,7 +1497,7 @@ fpu_emul:
 
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1616,7 +1616,7 @@ fpu_emul:
 
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1735,7 +1735,7 @@ fpu_emul:
 
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1853,7 +1853,7 @@ fpu_emul:
 
 		rt = regs->regs[MIPSInst_RT(inst)];
 		vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGSEGV;
 			break;
@@ -1970,7 +1970,7 @@ fpu_emul:
 			err = SIGBUS;
 			break;
 		}
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGBUS;
 			break;
@@ -2026,7 +2026,7 @@ fpu_emul:
 			err = SIGBUS;
 			break;
 		}
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
+		if (!access_ok((void __user *)vaddr, 4)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGBUS;
 			break;
@@ -2089,7 +2089,7 @@ fpu_emul:
 			err = SIGBUS;
 			break;
 		}
-		if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGBUS;
 			break;
@@ -2150,7 +2150,7 @@ fpu_emul:
 			err = SIGBUS;
 			break;
 		}
-		if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
+		if (!access_ok((void __user *)vaddr, 8)) {
 			current->thread.cp0_baduaddr = vaddr;
 			err = SIGBUS;
 			break;
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index ea54575255ea..0057c910bc2f 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -71,7 +71,7 @@ int ptrace_getregs(struct task_struct *child, struct user_pt_regs __user *data)
 	struct pt_regs *regs;
 	int i;
 
-	if (!access_ok(VERIFY_WRITE, data, 38 * 8))
+	if (!access_ok(data, 38 * 8))
 		return -EIO;
 
 	regs = task_pt_regs(child);
@@ -98,7 +98,7 @@ int ptrace_setregs(struct task_struct *child, struct user_pt_regs __user *data)
 	struct pt_regs *regs;
 	int i;
 
-	if (!access_ok(VERIFY_READ, data, 38 * 8))
+	if (!access_ok(data, 38 * 8))
 		return -EIO;
 
 	regs = task_pt_regs(child);
@@ -125,7 +125,7 @@ int ptrace_get_watch_regs(struct task_struct *child,
 
 	if (!cpu_has_watch || boot_cpu_data.watch_reg_use_cnt == 0)
 		return -EIO;
-	if (!access_ok(VERIFY_WRITE, addr, sizeof(struct pt_watch_regs)))
+	if (!access_ok(addr, sizeof(struct pt_watch_regs)))
 		return -EIO;
 
 #ifdef CONFIG_32BIT
@@ -167,7 +167,7 @@ int ptrace_set_watch_regs(struct task_struct *child,
 
 	if (!cpu_has_watch || boot_cpu_data.watch_reg_use_cnt == 0)
 		return -EIO;
-	if (!access_ok(VERIFY_READ, addr, sizeof(struct pt_watch_regs)))
+	if (!access_ok(addr, sizeof(struct pt_watch_regs)))
 		return -EIO;
 	/* Check the values. */
 	for (i = 0; i < boot_cpu_data.watch_reg_use_cnt; i++) {
@@ -359,7 +359,7 @@ int ptrace_getfpregs(struct task_struct *child, __u32 __user *data)
 {
 	int i;
 
-	if (!access_ok(VERIFY_WRITE, data, 33 * 8))
+	if (!access_ok(data, 33 * 8))
 		return -EIO;
 
 	if (tsk_used_math(child)) {
@@ -385,7 +385,7 @@ int ptrace_setfpregs(struct task_struct *child, __u32 __user *data)
 	u32 value;
 	int i;
 
-	if (!access_ok(VERIFY_READ, data, 33 * 8))
+	if (!access_ok(data, 33 * 8))
 		return -EIO;
 
 	init_fp_ctx(child);
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index d3a23758592c..d75337974ee9 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -590,7 +590,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, const struct sigaction __user *, act,
 	if (act) {
 		old_sigset_t mask;
 
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)))
+		if (!access_ok(act, sizeof(*act)))
 			return -EFAULT;
 		err |= __get_user(new_ka.sa.sa_handler, &act->sa_handler);
 		err |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
@@ -604,7 +604,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, const struct sigaction __user *, act,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
+		if (!access_ok(oact, sizeof(*oact)))
 			return -EFAULT;
 		err |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 		err |= __put_user(old_ka.sa.sa_handler, &oact->sa_handler);
@@ -630,7 +630,7 @@ asmlinkage void sys_sigreturn(void)
 
 	regs = current_pt_regs();
 	frame = (struct sigframe __user *)regs->regs[29];
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&blocked, &frame->sf_mask, sizeof(blocked)))
 		goto badframe;
@@ -667,7 +667,7 @@ asmlinkage void sys_rt_sigreturn(void)
 
 	regs = current_pt_regs();
 	frame = (struct rt_sigframe __user *)regs->regs[29];
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -705,7 +705,7 @@ static int setup_frame(void *sig_return, struct ksignal *ksig,
 	int err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		return -EFAULT;
 
 	err |= setup_sigcontext(regs, &frame->sf_sc);
@@ -744,7 +744,7 @@ static int setup_rt_frame(void *sig_return, struct ksignal *ksig,
 	int err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		return -EFAULT;
 
 	/* Create siginfo.  */
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index b5d9e1784aff..59b8965433c2 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -46,7 +46,7 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *,
 		old_sigset_t mask;
 		s32 handler;
 
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)))
+		if (!access_ok(act, sizeof(*act)))
 			return -EFAULT;
 		err |= __get_user(handler, &act->sa_handler);
 		new_ka.sa.sa_handler = (void __user *)(s64)handler;
@@ -61,7 +61,7 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
+		if (!access_ok(oact, sizeof(*oact)))
 			return -EFAULT;
 		err |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 		err |= __put_user((u32)(u64)old_ka.sa.sa_handler,
diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c
index 8f65aaf9206d..c498b027823e 100644
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -73,7 +73,7 @@ asmlinkage void sysn32_rt_sigreturn(void)
 
 	regs = current_pt_regs();
 	frame = (struct rt_sigframe_n32 __user *)regs->regs[29];
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask))
 		goto badframe;
@@ -110,7 +110,7 @@ static int setup_rt_frame_n32(void *sig_return, struct ksignal *ksig,
 	int err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		return -EFAULT;
 
 	/* Create siginfo.  */
diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c
index b6e3ddef48a0..df259618e834 100644
--- a/arch/mips/kernel/signal_o32.c
+++ b/arch/mips/kernel/signal_o32.c
@@ -118,7 +118,7 @@ static int setup_frame_32(void *sig_return, struct ksignal *ksig,
 	int err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		return -EFAULT;
 
 	err |= setup_sigcontext32(regs, &frame->sf_sc);
@@ -160,7 +160,7 @@ asmlinkage void sys32_rt_sigreturn(void)
 
 	regs = current_pt_regs();
 	frame = (struct rt_sigframe32 __user *)regs->regs[29];
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask))
 		goto badframe;
@@ -197,7 +197,7 @@ static int setup_rt_frame_32(void *sig_return, struct ksignal *ksig,
 	int err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+	if (!access_ok(frame, sizeof (*frame)))
 		return -EFAULT;
 
 	/* Convert (siginfo_t -> compat_siginfo_t) and copy to user. */
@@ -262,7 +262,7 @@ asmlinkage void sys32_sigreturn(void)
 
 	regs = current_pt_regs();
 	frame = (struct sigframe32 __user *)regs->regs[29];
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_conv_sigset_from_user(&blocked, &frame->sf_mask))
 		goto badframe;
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 41a0db08cd37..b6dc78ad5d8c 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -101,7 +101,7 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new)
 	if (unlikely(addr & 3))
 		return -EINVAL;
 
-	if (unlikely(!access_ok(VERIFY_WRITE, (const void __user *)addr, 4)))
+	if (unlikely(!access_ok((const void __user *)addr, 4)))
 		return -EINVAL;
 
 	if (cpu_has_llsc && R10000_LLSC_WAR) {
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index c60e7719ef77..595ca9c85111 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -936,7 +936,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		if (insn.dsp_format.func == lx_op) {
 			switch (insn.dsp_format.op) {
 			case lwx_op:
-				if (!access_ok(VERIFY_READ, addr, 4))
+				if (!access_ok(addr, 4))
 					goto sigbus;
 				LoadW(addr, value, res);
 				if (res)
@@ -945,7 +945,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				regs->regs[insn.dsp_format.rd] = value;
 				break;
 			case lhx_op:
-				if (!access_ok(VERIFY_READ, addr, 2))
+				if (!access_ok(addr, 2))
 					goto sigbus;
 				LoadHW(addr, value, res);
 				if (res)
@@ -968,7 +968,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			set_fs(USER_DS);
 			switch (insn.spec3_format.func) {
 			case lhe_op:
-				if (!access_ok(VERIFY_READ, addr, 2)) {
+				if (!access_ok(addr, 2)) {
 					set_fs(seg);
 					goto sigbus;
 				}
@@ -981,7 +981,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				regs->regs[insn.spec3_format.rt] = value;
 				break;
 			case lwe_op:
-				if (!access_ok(VERIFY_READ, addr, 4)) {
+				if (!access_ok(addr, 4)) {
 					set_fs(seg);
 					goto sigbus;
 				}
@@ -994,7 +994,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				regs->regs[insn.spec3_format.rt] = value;
 				break;
 			case lhue_op:
-				if (!access_ok(VERIFY_READ, addr, 2)) {
+				if (!access_ok(addr, 2)) {
 					set_fs(seg);
 					goto sigbus;
 				}
@@ -1007,7 +1007,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				regs->regs[insn.spec3_format.rt] = value;
 				break;
 			case she_op:
-				if (!access_ok(VERIFY_WRITE, addr, 2)) {
+				if (!access_ok(addr, 2)) {
 					set_fs(seg);
 					goto sigbus;
 				}
@@ -1020,7 +1020,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				}
 				break;
 			case swe_op:
-				if (!access_ok(VERIFY_WRITE, addr, 4)) {
+				if (!access_ok(addr, 4)) {
 					set_fs(seg);
 					goto sigbus;
 				}
@@ -1041,7 +1041,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 #endif
 		break;
 	case lh_op:
-		if (!access_ok(VERIFY_READ, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
@@ -1060,7 +1060,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		break;
 
 	case lw_op:
-		if (!access_ok(VERIFY_READ, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
@@ -1079,7 +1079,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		break;
 
 	case lhu_op:
-		if (!access_ok(VERIFY_READ, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
@@ -1106,7 +1106,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_READ, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		LoadWU(addr, value, res);
@@ -1129,7 +1129,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_READ, addr, 8))
+		if (!access_ok(addr, 8))
 			goto sigbus;
 
 		LoadDW(addr, value, res);
@@ -1144,7 +1144,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		goto sigill;
 
 	case sh_op:
-		if (!access_ok(VERIFY_WRITE, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		compute_return_epc(regs);
@@ -1164,7 +1164,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		break;
 
 	case sw_op:
-		if (!access_ok(VERIFY_WRITE, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		compute_return_epc(regs);
@@ -1192,7 +1192,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_WRITE, addr, 8))
+		if (!access_ok(addr, 8))
 			goto sigbus;
 
 		compute_return_epc(regs);
@@ -1254,7 +1254,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 
 		switch (insn.msa_mi10_format.func) {
 		case msa_ld_op:
-			if (!access_ok(VERIFY_READ, addr, sizeof(*fpr)))
+			if (!access_ok(addr, sizeof(*fpr)))
 				goto sigbus;
 
 			do {
@@ -1290,7 +1290,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			break;
 
 		case msa_st_op:
-			if (!access_ok(VERIFY_WRITE, addr, sizeof(*fpr)))
+			if (!access_ok(addr, sizeof(*fpr)))
 				goto sigbus;
 
 			/*
@@ -1463,7 +1463,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if (reg == 31)
 				goto sigbus;
 
-			if (!access_ok(VERIFY_READ, addr, 8))
+			if (!access_ok(addr, 8))
 				goto sigbus;
 
 			LoadW(addr, value, res);
@@ -1482,7 +1482,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if (reg == 31)
 				goto sigbus;
 
-			if (!access_ok(VERIFY_WRITE, addr, 8))
+			if (!access_ok(addr, 8))
 				goto sigbus;
 
 			value = regs->regs[reg];
@@ -1502,7 +1502,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if (reg == 31)
 				goto sigbus;
 
-			if (!access_ok(VERIFY_READ, addr, 16))
+			if (!access_ok(addr, 16))
 				goto sigbus;
 
 			LoadDW(addr, value, res);
@@ -1525,7 +1525,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if (reg == 31)
 				goto sigbus;
 
-			if (!access_ok(VERIFY_WRITE, addr, 16))
+			if (!access_ok(addr, 16))
 				goto sigbus;
 
 			value = regs->regs[reg];
@@ -1548,11 +1548,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if ((rvar > 9) || !reg)
 				goto sigill;
 			if (reg & 0x10) {
-				if (!access_ok
-				    (VERIFY_READ, addr, 4 * (rvar + 1)))
+				if (!access_ok(addr, 4 * (rvar + 1)))
 					goto sigbus;
 			} else {
-				if (!access_ok(VERIFY_READ, addr, 4 * rvar))
+				if (!access_ok(addr, 4 * rvar))
 					goto sigbus;
 			}
 			if (rvar == 9)
@@ -1585,11 +1584,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if ((rvar > 9) || !reg)
 				goto sigill;
 			if (reg & 0x10) {
-				if (!access_ok
-				    (VERIFY_WRITE, addr, 4 * (rvar + 1)))
+				if (!access_ok(addr, 4 * (rvar + 1)))
 					goto sigbus;
 			} else {
-				if (!access_ok(VERIFY_WRITE, addr, 4 * rvar))
+				if (!access_ok(addr, 4 * rvar))
 					goto sigbus;
 			}
 			if (rvar == 9)
@@ -1623,11 +1621,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if ((rvar > 9) || !reg)
 				goto sigill;
 			if (reg & 0x10) {
-				if (!access_ok
-				    (VERIFY_READ, addr, 8 * (rvar + 1)))
+				if (!access_ok(addr, 8 * (rvar + 1)))
 					goto sigbus;
 			} else {
-				if (!access_ok(VERIFY_READ, addr, 8 * rvar))
+				if (!access_ok(addr, 8 * rvar))
 					goto sigbus;
 			}
 			if (rvar == 9)
@@ -1665,11 +1662,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs,
 			if ((rvar > 9) || !reg)
 				goto sigill;
 			if (reg & 0x10) {
-				if (!access_ok
-				    (VERIFY_WRITE, addr, 8 * (rvar + 1)))
+				if (!access_ok(addr, 8 * (rvar + 1)))
 					goto sigbus;
 			} else {
-				if (!access_ok(VERIFY_WRITE, addr, 8 * rvar))
+				if (!access_ok(addr, 8 * rvar))
 					goto sigbus;
 			}
 			if (rvar == 9)
@@ -1788,7 +1784,7 @@ fpu_emul:
 		case mm_lwm16_op:
 			reg = insn.mm16_m_format.rlist;
 			rvar = reg + 1;
-			if (!access_ok(VERIFY_READ, addr, 4 * rvar))
+			if (!access_ok(addr, 4 * rvar))
 				goto sigbus;
 
 			for (i = 16; rvar; rvar--, i++) {
@@ -1808,7 +1804,7 @@ fpu_emul:
 		case mm_swm16_op:
 			reg = insn.mm16_m_format.rlist;
 			rvar = reg + 1;
-			if (!access_ok(VERIFY_WRITE, addr, 4 * rvar))
+			if (!access_ok(addr, 4 * rvar))
 				goto sigbus;
 
 			for (i = 16; rvar; rvar--, i++) {
@@ -1862,7 +1858,7 @@ fpu_emul:
 	}
 
 loadHW:
-	if (!access_ok(VERIFY_READ, addr, 2))
+	if (!access_ok(addr, 2))
 		goto sigbus;
 
 	LoadHW(addr, value, res);
@@ -1872,7 +1868,7 @@ loadHW:
 	goto success;
 
 loadHWU:
-	if (!access_ok(VERIFY_READ, addr, 2))
+	if (!access_ok(addr, 2))
 		goto sigbus;
 
 	LoadHWU(addr, value, res);
@@ -1882,7 +1878,7 @@ loadHWU:
 	goto success;
 
 loadW:
-	if (!access_ok(VERIFY_READ, addr, 4))
+	if (!access_ok(addr, 4))
 		goto sigbus;
 
 	LoadW(addr, value, res);
@@ -1900,7 +1896,7 @@ loadWU:
 	 * would blow up, so for now we don't handle unaligned 64-bit
 	 * instructions on 32-bit kernels.
 	 */
-	if (!access_ok(VERIFY_READ, addr, 4))
+	if (!access_ok(addr, 4))
 		goto sigbus;
 
 	LoadWU(addr, value, res);
@@ -1922,7 +1918,7 @@ loadDW:
 	 * would blow up, so for now we don't handle unaligned 64-bit
 	 * instructions on 32-bit kernels.
 	 */
-	if (!access_ok(VERIFY_READ, addr, 8))
+	if (!access_ok(addr, 8))
 		goto sigbus;
 
 	LoadDW(addr, value, res);
@@ -1936,7 +1932,7 @@ loadDW:
 	goto sigill;
 
 storeHW:
-	if (!access_ok(VERIFY_WRITE, addr, 2))
+	if (!access_ok(addr, 2))
 		goto sigbus;
 
 	value = regs->regs[reg];
@@ -1946,7 +1942,7 @@ storeHW:
 	goto success;
 
 storeW:
-	if (!access_ok(VERIFY_WRITE, addr, 4))
+	if (!access_ok(addr, 4))
 		goto sigbus;
 
 	value = regs->regs[reg];
@@ -1964,7 +1960,7 @@ storeDW:
 	 * would blow up, so for now we don't handle unaligned 64-bit
 	 * instructions on 32-bit kernels.
 	 */
-	if (!access_ok(VERIFY_WRITE, addr, 8))
+	if (!access_ok(addr, 8))
 		goto sigbus;
 
 	value = regs->regs[reg];
@@ -2122,7 +2118,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr)
 		goto sigbus;
 
 	case MIPS16e_lh_op:
-		if (!access_ok(VERIFY_READ, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		LoadHW(addr, value, res);
@@ -2133,7 +2129,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr)
 		break;
 
 	case MIPS16e_lhu_op:
-		if (!access_ok(VERIFY_READ, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		LoadHWU(addr, value, res);
@@ -2146,7 +2142,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr)
 	case MIPS16e_lw_op:
 	case MIPS16e_lwpc_op:
 	case MIPS16e_lwsp_op:
-		if (!access_ok(VERIFY_READ, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		LoadW(addr, value, res);
@@ -2165,7 +2161,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr)
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_READ, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		LoadWU(addr, value, res);
@@ -2189,7 +2185,7 @@ loadDW:
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_READ, addr, 8))
+		if (!access_ok(addr, 8))
 			goto sigbus;
 
 		LoadDW(addr, value, res);
@@ -2204,7 +2200,7 @@ loadDW:
 		goto sigill;
 
 	case MIPS16e_sh_op:
-		if (!access_ok(VERIFY_WRITE, addr, 2))
+		if (!access_ok(addr, 2))
 			goto sigbus;
 
 		MIPS16e_compute_return_epc(regs, &oldinst);
@@ -2217,7 +2213,7 @@ loadDW:
 	case MIPS16e_sw_op:
 	case MIPS16e_swsp_op:
 	case MIPS16e_i8_op:	/* actually - MIPS16e_swrasp_func */
-		if (!access_ok(VERIFY_WRITE, addr, 4))
+		if (!access_ok(addr, 4))
 			goto sigbus;
 
 		MIPS16e_compute_return_epc(regs, &oldinst);
@@ -2237,7 +2233,7 @@ writeDW:
 		 * would blow up, so for now we don't handle unaligned 64-bit
 		 * instructions on 32-bit kernels.
 		 */
-		if (!access_ok(VERIFY_WRITE, addr, 8))
+		if (!access_ok(addr, 8))
 			goto sigbus;
 
 		MIPS16e_compute_return_epc(regs, &oldinst);
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 82e2993c1a2c..e60e29078ef5 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -1063,7 +1063,7 @@ emul:
 				     MIPSInst_SIMM(ir));
 		MIPS_FPU_EMU_INC_STATS(loads);
 
-		if (!access_ok(VERIFY_READ, dva, sizeof(u64))) {
+		if (!access_ok(dva, sizeof(u64))) {
 			MIPS_FPU_EMU_INC_STATS(errors);
 			*fault_addr = dva;
 			return SIGBUS;
@@ -1081,7 +1081,7 @@ emul:
 				      MIPSInst_SIMM(ir));
 		MIPS_FPU_EMU_INC_STATS(stores);
 		DIFROMREG(dval, MIPSInst_RT(ir));
-		if (!access_ok(VERIFY_WRITE, dva, sizeof(u64))) {
+		if (!access_ok(dva, sizeof(u64))) {
 			MIPS_FPU_EMU_INC_STATS(errors);
 			*fault_addr = dva;
 			return SIGBUS;
@@ -1097,7 +1097,7 @@ emul:
 		wva = (u32 __user *) (xcp->regs[MIPSInst_RS(ir)] +
 				      MIPSInst_SIMM(ir));
 		MIPS_FPU_EMU_INC_STATS(loads);
-		if (!access_ok(VERIFY_READ, wva, sizeof(u32))) {
+		if (!access_ok(wva, sizeof(u32))) {
 			MIPS_FPU_EMU_INC_STATS(errors);
 			*fault_addr = wva;
 			return SIGBUS;
@@ -1115,7 +1115,7 @@ emul:
 				      MIPSInst_SIMM(ir));
 		MIPS_FPU_EMU_INC_STATS(stores);
 		SIFROMREG(wval, MIPSInst_RT(ir));
-		if (!access_ok(VERIFY_WRITE, wva, sizeof(u32))) {
+		if (!access_ok(wva, sizeof(u32))) {
 			MIPS_FPU_EMU_INC_STATS(errors);
 			*fault_addr = wva;
 			return SIGBUS;
@@ -1493,7 +1493,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 				xcp->regs[MIPSInst_FT(ir)]);
 
 			MIPS_FPU_EMU_INC_STATS(loads);
-			if (!access_ok(VERIFY_READ, va, sizeof(u32))) {
+			if (!access_ok(va, sizeof(u32))) {
 				MIPS_FPU_EMU_INC_STATS(errors);
 				*fault_addr = va;
 				return SIGBUS;
@@ -1513,7 +1513,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			MIPS_FPU_EMU_INC_STATS(stores);
 
 			SIFROMREG(val, MIPSInst_FS(ir));
-			if (!access_ok(VERIFY_WRITE, va, sizeof(u32))) {
+			if (!access_ok(va, sizeof(u32))) {
 				MIPS_FPU_EMU_INC_STATS(errors);
 				*fault_addr = va;
 				return SIGBUS;
@@ -1590,7 +1590,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 				xcp->regs[MIPSInst_FT(ir)]);
 
 			MIPS_FPU_EMU_INC_STATS(loads);
-			if (!access_ok(VERIFY_READ, va, sizeof(u64))) {
+			if (!access_ok(va, sizeof(u64))) {
 				MIPS_FPU_EMU_INC_STATS(errors);
 				*fault_addr = va;
 				return SIGBUS;
@@ -1609,7 +1609,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 
 			MIPS_FPU_EMU_INC_STATS(stores);
 			DIFROMREG(val, MIPSInst_FS(ir));
-			if (!access_ok(VERIFY_WRITE, va, sizeof(u64))) {
+			if (!access_ok(va, sizeof(u64))) {
 				MIPS_FPU_EMU_INC_STATS(errors);
 				*fault_addr = va;
 				return SIGBUS;
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 70a523151ff3..55099fbff4e6 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -76,7 +76,7 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes,
 {
 	if (bytes == 0)
 		return 0;
-	if (!access_ok(VERIFY_WRITE, (void __user *) addr, bytes))
+	if (!access_ok((void __user *) addr, bytes))
 		return -EFAULT;
 
 	__flush_icache_user_range(addr, addr + bytes);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 5a4875cac1ec..0d14e0d8eacf 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -195,8 +195,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					(void __user *)start, len)))
+	if (unlikely(!access_ok((void __user *)start, len)))
 		return 0;
 
 	/*
diff --git a/arch/mips/oprofile/backtrace.c b/arch/mips/oprofile/backtrace.c
index 806fb798091f..07d98ba7f49e 100644
--- a/arch/mips/oprofile/backtrace.c
+++ b/arch/mips/oprofile/backtrace.c
@@ -19,7 +19,7 @@ struct stackframe {
 static inline int get_mem(unsigned long addr, unsigned long *result)
 {
 	unsigned long *address = (unsigned long *) addr;
-	if (!access_ok(VERIFY_READ, address, sizeof(unsigned long)))
+	if (!access_ok(address, sizeof(unsigned long)))
 		return -1;
 	if (__copy_from_user_inatomic(result, address, sizeof(unsigned long)))
 		return -3;
diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c
index 99c720be72d2..9ff26b0cd3b6 100644
--- a/arch/mips/sibyte/common/sb_tbprof.c
+++ b/arch/mips/sibyte/common/sb_tbprof.c
@@ -458,7 +458,7 @@ static ssize_t sbprof_tb_read(struct file *filp, char *buf,
 	char *dest    =	 buf;
 	long  cur_off = *offp;
 
-	if (!access_ok(VERIFY_WRITE, buf, size))
+	if (!access_ok(buf, size))
 		return -EFAULT;
 
 	mutex_lock(&sbp.lock);
diff --git a/arch/nds32/include/asm/futex.h b/arch/nds32/include/asm/futex.h
index cb6cb91cfdf8..baf178bf1d0b 100644
--- a/arch/nds32/include/asm/futex.h
+++ b/arch/nds32/include/asm/futex.h
@@ -40,7 +40,7 @@ futex_atomic_cmpxchg_inatomic(u32 * uval, u32 __user * uaddr,
 	int ret = 0;
 	u32 val, tmp, flags;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	smp_mb();
diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h
index 362a32d9bd16..53dcb49b0b12 100644
--- a/arch/nds32/include/asm/uaccess.h
+++ b/arch/nds32/include/asm/uaccess.h
@@ -13,9 +13,6 @@
 #include <asm/types.h>
 #include <linux/mm.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define __asmeq(x, y)  ".ifnc " x "," y " ; .err ; .endif\n\t"
 
 /*
@@ -53,7 +50,7 @@ static inline void set_fs(mm_segment_t fs)
 
 #define __range_ok(addr, size) (size <= get_fs() && addr <= (get_fs() -size))
 
-#define access_ok(type, addr, size)	\
+#define access_ok(addr, size)	\
 	__range_ok((unsigned long)addr, (unsigned long)size)
 /*
  * Single-value transfer routines.  They automatically use the right
@@ -94,7 +91,7 @@ static inline void set_fs(mm_segment_t fs)
 ({									\
 	const __typeof__(*(ptr)) __user *__p = (ptr);			\
 	might_fault();							\
-	if (access_ok(VERIFY_READ, __p, sizeof(*__p))) {		\
+	if (access_ok(__p, sizeof(*__p))) {		\
 		__get_user_err((x), __p, (err));			\
 	} else {							\
 		(x) = 0; (err) = -EFAULT;				\
@@ -189,7 +186,7 @@ do {									\
 ({									\
 	__typeof__(*(ptr)) __user *__p = (ptr);				\
 	might_fault();							\
-	if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) {		\
+	if (access_ok(__p, sizeof(*__p))) {		\
 		__put_user_err((x), __p, (err));			\
 	} else	{							\
 		(err) = -EFAULT;					\
@@ -279,7 +276,7 @@ extern unsigned long __arch_copy_to_user(void __user * to, const void *from,
 #define INLINE_COPY_TO_USER
 static inline unsigned long clear_user(void __user * to, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		n = __arch_clear_user(to, n);
 	return n;
 }
diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c
index 5e00ce54d0ff..334c2a6cec23 100644
--- a/arch/nds32/kernel/perf_event_cpu.c
+++ b/arch/nds32/kernel/perf_event_cpu.c
@@ -1306,7 +1306,7 @@ user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp)
 		(unsigned long *)(fp - (unsigned long)sizeof(buftail));
 
 	/* Check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail)))
+	if (!access_ok(user_frame_tail, sizeof(buftail)))
 		return 0;
 	if (__copy_from_user_inatomic
 		(&buftail, user_frame_tail, sizeof(buftail)))
@@ -1332,7 +1332,7 @@ user_backtrace_opt_size(struct perf_callchain_entry_ctx *entry,
 		(unsigned long *)(fp - (unsigned long)sizeof(buftail));
 
 	/* Check accessibility of one struct frame_tail beyond */
-	if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail)))
+	if (!access_ok(user_frame_tail, sizeof(buftail)))
 		return 0;
 	if (__copy_from_user_inatomic
 		(&buftail, user_frame_tail, sizeof(buftail)))
@@ -1386,7 +1386,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 		user_frame_tail =
 			(unsigned long *)(fp - (unsigned long)sizeof(fp));
 
-		if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(fp)))
+		if (!access_ok(user_frame_tail, sizeof(fp)))
 			return;
 
 		if (__copy_from_user_inatomic
@@ -1406,8 +1406,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 				(unsigned long *)(fp -
 					(unsigned long)sizeof(buftail));
 
-			if (!access_ok
-				(VERIFY_READ, user_frame_tail, sizeof(buftail)))
+			if (!access_ok(user_frame_tail, sizeof(buftail)))
 				return;
 
 			if (__copy_from_user_inatomic
@@ -1424,7 +1423,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 					(unsigned long *)(fp - (unsigned long)
 						sizeof(buftail_opt_size));
 
-				if (!access_ok(VERIFY_READ, user_frame_tail,
+				if (!access_ok(user_frame_tail,
 					       sizeof(buftail_opt_size)))
 					return;
 
diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c
index 5b5be082cfa4..5f7660aa2d68 100644
--- a/arch/nds32/kernel/signal.c
+++ b/arch/nds32/kernel/signal.c
@@ -151,7 +151,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 
 	frame = (struct rt_sigframe __user *)regs->sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (restore_sigframe(regs, frame))
@@ -275,7 +275,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t * set, struct pt_regs *regs)
 	    get_sigframe(ksig, regs, sizeof(*frame));
 	int err = 0;
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	__put_user_error(0, &frame->uc.uc_flags, err);
diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c
index e1aed9dc692d..c8b9061a2ee3 100644
--- a/arch/nds32/mm/alignment.c
+++ b/arch/nds32/mm/alignment.c
@@ -289,13 +289,13 @@ static inline int do_16(unsigned long inst, struct pt_regs *regs)
 		unaligned_addr += shift;
 
 	if (load) {
-		if (!access_ok(VERIFY_READ, (void *)unaligned_addr, len))
+		if (!access_ok((void *)unaligned_addr, len))
 			return -EACCES;
 
 		get_data(unaligned_addr, &target_val, len);
 		*idx_to_addr(regs, target_idx) = target_val;
 	} else {
-		if (!access_ok(VERIFY_WRITE, (void *)unaligned_addr, len))
+		if (!access_ok((void *)unaligned_addr, len))
 			return -EACCES;
 		target_val = *idx_to_addr(regs, target_idx);
 		set_data((void *)unaligned_addr, target_val, len);
@@ -479,7 +479,7 @@ static inline int do_32(unsigned long inst, struct pt_regs *regs)
 
 	if (load) {
 
-		if (!access_ok(VERIFY_READ, (void *)unaligned_addr, len))
+		if (!access_ok((void *)unaligned_addr, len))
 			return -EACCES;
 
 		get_data(unaligned_addr, &target_val, len);
@@ -491,7 +491,7 @@ static inline int do_32(unsigned long inst, struct pt_regs *regs)
 			*idx_to_addr(regs, RT(inst)) = target_val;
 	} else {
 
-		if (!access_ok(VERIFY_WRITE, (void *)unaligned_addr, len))
+		if (!access_ok((void *)unaligned_addr, len))
 			return -EACCES;
 
 		target_val = *idx_to_addr(regs, RT(inst));
diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h
index dfa3c7cb30b4..e0ea10806491 100644
--- a/arch/nios2/include/asm/uaccess.h
+++ b/arch/nios2/include/asm/uaccess.h
@@ -37,7 +37,7 @@
 	(((signed long)(((long)get_fs().seg) &	\
 		((long)(addr) | (((long)(addr)) + (len)) | (len)))) == 0)
 
-#define access_ok(type, addr, len)		\
+#define access_ok(addr, len)		\
 	likely(__access_ok((unsigned long)(addr), (unsigned long)(len)))
 
 # define __EX_TABLE_SECTION	".section __ex_table,\"a\"\n"
@@ -70,7 +70,7 @@ static inline unsigned long __must_check __clear_user(void __user *to,
 static inline unsigned long __must_check clear_user(void __user *to,
 						    unsigned long n)
 {
-	if (!access_ok(VERIFY_WRITE, to, n))
+	if (!access_ok(to, n))
 		return n;
 	return __clear_user(to, n);
 }
@@ -142,7 +142,7 @@ do {									\
 	long __gu_err = -EFAULT;					\
 	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);		\
 	unsigned long __gu_val = 0;					\
-	if (access_ok(VERIFY_READ,  __gu_ptr, sizeof(*__gu_ptr)))	\
+	if (access_ok( __gu_ptr, sizeof(*__gu_ptr)))	\
 		__get_user_common(__gu_val, sizeof(*__gu_ptr),		\
 			__gu_ptr, __gu_err);				\
 	(x) = (__force __typeof__(x))__gu_val;				\
@@ -168,7 +168,7 @@ do {									\
 	long __pu_err = -EFAULT;					\
 	__typeof__(*(ptr)) __user *__pu_ptr = (ptr);			\
 	__typeof__(*(ptr)) __pu_val = (__typeof(*ptr))(x);		\
-	if (access_ok(VERIFY_WRITE, __pu_ptr, sizeof(*__pu_ptr))) {	\
+	if (access_ok(__pu_ptr, sizeof(*__pu_ptr))) {	\
 		switch (sizeof(*__pu_ptr)) {				\
 		case 1:							\
 			__put_user_asm(__pu_val, "stb", __pu_ptr, __pu_err); \
diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c
index 20662b0f6c9e..4a81876b6086 100644
--- a/arch/nios2/kernel/signal.c
+++ b/arch/nios2/kernel/signal.c
@@ -106,7 +106,7 @@ asmlinkage int do_rt_sigreturn(struct switch_stack *sw)
 	sigset_t set;
 	int rval;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
index 618da4a1bffb..fe894e6331ae 100644
--- a/arch/openrisc/include/asm/futex.h
+++ b/arch/openrisc/include/asm/futex.h
@@ -72,7 +72,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0;
 	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ (				\
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index bbf5c79cce7a..bc8191a34db7 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -58,7 +58,7 @@
 /* Ensure that addr is below task's addr_limit */
 #define __addr_ok(addr) ((unsigned long) addr < get_fs())
 
-#define access_ok(type, addr, size) \
+#define access_ok(addr, size) \
 	__range_ok((unsigned long)addr, (unsigned long)size)
 
 /*
@@ -102,7 +102,7 @@ extern long __put_user_bad(void);
 ({									\
 	long __pu_err = -EFAULT;					\
 	__typeof__(*(ptr)) *__pu_addr = (ptr);				\
-	if (access_ok(VERIFY_WRITE, __pu_addr, size))			\
+	if (access_ok(__pu_addr, size))			\
 		__put_user_size((x), __pu_addr, (size), __pu_err);	\
 	__pu_err;							\
 })
@@ -175,7 +175,7 @@ struct __large_struct {
 ({									\
 	long __gu_err = -EFAULT, __gu_val = 0;				\
 	const __typeof__(*(ptr)) * __gu_addr = (ptr);			\
-	if (access_ok(VERIFY_READ, __gu_addr, size))			\
+	if (access_ok(__gu_addr, size))			\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
 	__gu_err;							\
@@ -254,7 +254,7 @@ extern unsigned long __clear_user(void *addr, unsigned long size);
 static inline __must_check unsigned long
 clear_user(void *addr, unsigned long size)
 {
-	if (likely(access_ok(VERIFY_WRITE, addr, size)))
+	if (likely(access_ok(addr, size)))
 		size = __clear_user(addr, size);
 	return size;
 }
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 265f10fb3930..5ac9d3b1d615 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -50,7 +50,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 
 	/*
 	 * Restore the regs from &sc->regs.
-	 * (sc is already checked for VERIFY_READ since the sigframe was
+	 * (sc is already checked since the sigframe was
 	 *  checked in sys_sigreturn previously)
 	 */
 	err |= __copy_from_user(regs, sc->regs.gpr, 32 * sizeof(unsigned long));
@@ -83,7 +83,7 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs)
 	if (((long)frame) & 3)
 		goto badframe;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -161,7 +161,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	/* Create siginfo.  */
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index cf7ba058f619..d2c3e4106851 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (uaccess_kernel() && !uaddr)
 		return -EFAULT;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	/* HPPA has no cmpxchg in hardware and therefore the
diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h
index ea70e36ce6af..30ac2865ea73 100644
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -27,7 +27,7 @@
  * that put_user is the same as __put_user, etc.
  */
 
-#define access_ok(type, uaddr, size)	\
+#define access_ok(uaddr, size)	\
 	( (uaddr) == (uaddr) )
 
 #define put_user __put_user
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 94542776a62d..88b38b37c21b 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -72,7 +72,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	int ret = 0;
 	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
         __asm__ __volatile__ (
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index ebc0b916dcf9..b31bf45eebd4 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -62,7 +62,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size,
 
 #endif
 
-#define access_ok(type, addr, size)		\
+#define access_ok(addr, size)		\
 	(__chk_user_ptr(addr), (void)(type),		\
 	 __access_ok((__force unsigned long)(addr), (size), get_fs()))
 
@@ -166,7 +166,7 @@ do {								\
 	long __pu_err = -EFAULT;					\
 	__typeof__(*(ptr)) __user *__pu_addr = (ptr);			\
 	might_fault();							\
-	if (access_ok(VERIFY_WRITE, __pu_addr, size))			\
+	if (access_ok(__pu_addr, size))			\
 		__put_user_size((x), __pu_addr, (size), __pu_err);	\
 	__pu_err;							\
 })
@@ -276,7 +276,7 @@ do {								\
 	__long_type(*(ptr)) __gu_val = 0;				\
 	__typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
 	might_fault();							\
-	if (access_ok(VERIFY_READ, __gu_addr, (size))) {		\
+	if (access_ok(__gu_addr, (size))) {		\
 		barrier_nospec();					\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	}								\
@@ -374,7 +374,7 @@ extern unsigned long __clear_user(void __user *addr, unsigned long size);
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
 {
 	might_fault();
-	if (likely(access_ok(VERIFY_WRITE, addr, size)))
+	if (likely(access_ok(addr, size)))
 		return __clear_user(addr, size);
 	return size;
 }
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 11550a3d1ac2..0d1b6370bae0 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -131,8 +131,7 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 
 	/* Verify the address of the operand */
 	if (unlikely(user_mode(regs) &&
-		     !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ),
-				addr, nb)))
+		     !access_ok(addr, nb)))
 		return -EFAULT;
 
 	/* userland only */
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 10fabae2574d..8246f437bbc6 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -523,7 +523,7 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf,
 		args_buf->status = VALIDATE_INCOMPLETE;
 	}
 
-	if (!access_ok(VERIFY_READ, buf, count)) {
+	if (!access_ok(buf, count)) {
 		rc = -EFAULT;
 		goto done;
 	}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 38cadae4ca4f..8a1746d755c9 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -335,7 +335,7 @@ static ssize_t rtas_log_read(struct file * file, char __user * buf,
 
 	count = rtas_error_log_buffer_max;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	tmp = kmalloc(count, GFP_KERNEL);
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index b3e8db376ecd..e6c30cee6abf 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -44,7 +44,7 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
 	newsp = (oldsp - frame_size) & ~0xFUL;
 
 	/* Check access */
-	if (!access_ok(VERIFY_WRITE, (void __user *)newsp, oldsp - newsp))
+	if (!access_ok((void __user *)newsp, oldsp - newsp))
 		return NULL;
 
         return (void __user *)newsp;
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 2d47cc79e5b3..ede4f04281ae 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1017,7 +1017,7 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int
 #else
 	if (__get_user(mcp, &ucp->uc_regs))
 		return -EFAULT;
-	if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp)))
+	if (!access_ok(mcp, sizeof(*mcp)))
 		return -EFAULT;
 #endif
 	set_current_blocked(&set);
@@ -1120,7 +1120,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 		 */
 		mctx = (struct mcontext __user *)
 			((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
-		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
+		if (!access_ok(old_ctx, ctx_size)
 		    || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
 		    || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
 		    || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
@@ -1128,7 +1128,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	}
 	if (new_ctx == NULL)
 		return 0;
-	if (!access_ok(VERIFY_READ, new_ctx, ctx_size) ||
+	if (!access_ok(new_ctx, ctx_size) ||
 	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
 		return -EFAULT;
 
@@ -1169,7 +1169,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 
 	rt_sf = (struct rt_sigframe __user *)
 		(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
-	if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf)))
+	if (!access_ok(rt_sf, sizeof(*rt_sf)))
 		goto bad;
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1315,7 +1315,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
 	current->thread.debug.dbcr0 = new_dbcr0;
 #endif
 
-	if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) ||
+	if (!access_ok(ctx, sizeof(*ctx)) ||
 	    fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
 		return -EFAULT;
 
@@ -1500,7 +1500,7 @@ SYSCALL_DEFINE0(sigreturn)
 	{
 		sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
 		addr = sr;
-		if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+		if (!access_ok(sr, sizeof(*sr))
 		    || restore_user_regs(regs, sr, 1))
 			goto badframe;
 	}
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 0935fe6c282a..bd5e6834ca69 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -383,7 +383,7 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
 	err |= __get_user(v_regs, &sc->v_regs);
 	if (err)
 		return err;
-	if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
+	if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
 		return -EFAULT;
 	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
 	if (v_regs != NULL && (msr & MSR_VEC) != 0) {
@@ -502,10 +502,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 	err |= __get_user(tm_v_regs, &tm_sc->v_regs);
 	if (err)
 		return err;
-	if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
+	if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
 		return -EFAULT;
-	if (tm_v_regs && !access_ok(VERIFY_READ,
-				    tm_v_regs, 34 * sizeof(vector128)))
+	if (tm_v_regs && !access_ok(tm_v_regs, 34 * sizeof(vector128)))
 		return -EFAULT;
 	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
 	if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) {
@@ -671,7 +670,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 		ctx_has_vsx_region = 1;
 
 	if (old_ctx != NULL) {
-		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
+		if (!access_ok(old_ctx, ctx_size)
 		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
 					ctx_has_vsx_region)
 		    || __copy_to_user(&old_ctx->uc_sigmask,
@@ -680,7 +679,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	}
 	if (new_ctx == NULL)
 		return 0;
-	if (!access_ok(VERIFY_READ, new_ctx, ctx_size)
+	if (!access_ok(new_ctx, ctx_size)
 	    || __get_user(tmp, (u8 __user *) new_ctx)
 	    || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1))
 		return -EFAULT;
@@ -725,7 +724,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
+	if (!access_ok(uc, sizeof(*uc)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index 466216506eb2..e6982ab21816 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -89,7 +89,7 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 	if ( (unsigned long)n >= 4096 )
 	{
 		unsigned long __user *buffer = (unsigned long __user *)n;
-		if (!access_ok(VERIFY_READ, buffer, 5*sizeof(unsigned long))
+		if (!access_ok(buffer, 5*sizeof(unsigned long))
 		    || __get_user(n, buffer)
 		    || __get_user(inp, ((fd_set __user * __user *)(buffer+1)))
 		    || __get_user(outp, ((fd_set  __user * __user *)(buffer+2)))
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 00af2c4febf4..64936b60d521 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -837,7 +837,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
 	addr = (__force const void __user *)ea;
 
 	/* Check it */
-	if (!access_ok(VERIFY_READ, addr, 16)) {
+	if (!access_ok(addr, 16)) {
 		pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx"
 			 " instr=%08x addr=%016lx\n",
 			 smp_processor_id(), current->comm, current->pid,
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 6f2d2fb4e098..bd2dcfbf00cd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1744,7 +1744,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 	int first_pass;
 	unsigned long hpte[2];
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (kvm_is_radix(kvm))
 		return 0;
@@ -1844,7 +1844,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	int mmu_ready;
 	int pshift;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (kvm_is_radix(kvm))
 		return -EINVAL;
diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c
index a0cb63fb76a1..890d4ddd91d6 100644
--- a/arch/powerpc/lib/checksum_wrappers.c
+++ b/arch/powerpc/lib/checksum_wrappers.c
@@ -37,7 +37,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst,
 		goto out;
 	}
 
-	if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) {
+	if (unlikely((len < 0) || !access_ok(src, len))) {
 		*err_ptr = -EFAULT;
 		csum = (__force unsigned int)sum;
 		goto out;
@@ -78,7 +78,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
 		goto out;
 	}
 
-	if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) {
+	if (unlikely((len < 0) || !access_ok(dst, len))) {
 		*err_ptr = -EFAULT;
 		csum = -1; /* invalid checksum */
 		goto out;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a6dcfda3e11e..887f11bcf330 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -274,7 +274,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 			return false;
 
 		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
-		    access_ok(VERIFY_READ, nip, sizeof(*nip))) {
+		    access_ok(nip, sizeof(*nip))) {
 			unsigned int inst;
 			int res;
 
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 3327551c8b47..5e4178790dee 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -214,7 +214,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
 		return 0;
 	}
 
-	if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32)))
+	if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
 		return -EFAULT;
 
 	down_write(&mm->mmap_sem);
diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c
index 5df6290d1ccc..260c53700978 100644
--- a/arch/powerpc/oprofile/backtrace.c
+++ b/arch/powerpc/oprofile/backtrace.c
@@ -31,7 +31,7 @@ static unsigned int user_getsp32(unsigned int sp, int is_first)
 	unsigned int stack_frame[2];
 	void __user *p = compat_ptr(sp);
 
-	if (!access_ok(VERIFY_READ, p, sizeof(stack_frame)))
+	if (!access_ok(p, sizeof(stack_frame)))
 		return 0;
 
 	/*
@@ -57,7 +57,7 @@ static unsigned long user_getsp64(unsigned long sp, int is_first)
 {
 	unsigned long stack_frame[3];
 
-	if (!access_ok(VERIFY_READ, (void __user *)sp, sizeof(stack_frame)))
+	if (!access_ok((void __user *)sp, sizeof(stack_frame)))
 		return 0;
 
 	if (__copy_from_user_inatomic(stack_frame, (void __user *)sp,
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 43e7b93f27c7..ae8123edddc6 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -609,7 +609,7 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
 	if (len < 4)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	udata = (void __user *)buf;
@@ -717,7 +717,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 	if (len < 4)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	udata = (void __user *)buf;
@@ -856,7 +856,7 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 		return -EINVAL;
 
 	udata = (void __user *)buf;
-	if (!access_ok(VERIFY_READ, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	if (__get_user(wbox_data, udata))
@@ -1994,7 +1994,7 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	ret = spu_acquire_saved(ctx);
@@ -2034,7 +2034,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	ret = spu_acquire_saved(ctx);
@@ -2077,7 +2077,7 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	ret = spu_acquire_saved(ctx);
@@ -2129,7 +2129,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	ret = spu_acquire_saved(ctx);
@@ -2160,7 +2160,7 @@ static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx,
 	if (len < ret)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, buf, len))
+	if (!access_ok(buf, len))
 		return -EFAULT;
 
 	info.proxydma_info_type = ctx->csa.prob.dma_querytype_RW;
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 6c7ad1d8b32e..2623996a193a 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -192,7 +192,7 @@ static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
 	u32 data, pos, len, todo;
 	int rc;
 
-	if (!access_ok(VERIFY_WRITE, ubuf, count))
+	if (!access_ok(ubuf, count))
 		return -EFAULT;
 
 	todo = count;
@@ -283,7 +283,7 @@ static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
 	u32 data, pos, len, todo;
 	int rc;
 
-	if (!access_ok(VERIFY_READ, ubuf, count))
+	if (!access_ok(ubuf, count))
 		return -EFAULT;
 
 	todo = count;
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
index 054ce7a16fc3..24b157e1e890 100644
--- a/arch/powerpc/platforms/pseries/scanlog.c
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -63,7 +63,7 @@ static ssize_t scanlog_read(struct file *file, char __user *buf,
 		return -EINVAL;
 	}
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	for (;;) {
diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h
index 3b19eba1bc8e..66641624d8a5 100644
--- a/arch/riscv/include/asm/futex.h
+++ b/arch/riscv/include/asm/futex.h
@@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	u32 val;
 	uintptr_t tmp;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__enable_user_access();
diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 8c3e3e3c8be1..637b896894fc 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -54,14 +54,8 @@ static inline void set_fs(mm_segment_t fs)
 #define user_addr_max()	(get_fs())
 
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /**
  * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE.  Note that
- *        %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- *        to write to a block, it is always safe to read from it.
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
@@ -76,7 +70,7 @@ static inline void set_fs(mm_segment_t fs)
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type, addr, size) ({					\
+#define access_ok(addr, size) ({					\
 	__chk_user_ptr(addr);						\
 	likely(__access_ok((unsigned long __force)(addr), (size)));	\
 })
@@ -258,7 +252,7 @@ do {								\
 ({								\
 	const __typeof__(*(ptr)) __user *__p = (ptr);		\
 	might_fault();						\
-	access_ok(VERIFY_READ, __p, sizeof(*__p)) ?		\
+	access_ok(__p, sizeof(*__p)) ?		\
 		__get_user((x), __p) :				\
 		((x) = 0, -EFAULT);				\
 })
@@ -386,7 +380,7 @@ do {								\
 ({								\
 	__typeof__(*(ptr)) __user *__p = (ptr);			\
 	might_fault();						\
-	access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ?		\
+	access_ok(__p, sizeof(*__p)) ?		\
 		__put_user((x), __p) :				\
 		-EFAULT;					\
 })
@@ -421,7 +415,7 @@ static inline
 unsigned long __must_check clear_user(void __user *to, unsigned long n)
 {
 	might_fault();
-	return access_ok(VERIFY_WRITE, to, n) ?
+	return access_ok(to, n) ?
 		__clear_user(to, n) : n;
 }
 
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index f9b5e7e352ef..837e1646091a 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -115,7 +115,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 
 	frame = (struct rt_sigframe __user *)regs->sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -187,7 +187,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 	long err = 0;
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame));
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index ad6b91013a05..bd2545977ad3 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -48,7 +48,7 @@ static inline int __range_ok(unsigned long addr, unsigned long size)
 	__range_ok((unsigned long)(addr), (size));	\
 })
 
-#define access_ok(type, addr, size) __access_ok(addr, size)
+#define access_ok(addr, size) __access_ok(addr, size)
 
 unsigned long __must_check
 raw_copy_from_user(void *to, const void __user *from, unsigned long n);
diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h
index b58f3d95dc19..36b84cfd3f67 100644
--- a/arch/sh/include/asm/checksum_32.h
+++ b/arch/sh/include/asm/checksum_32.h
@@ -197,7 +197,7 @@ static inline __wsum csum_and_copy_to_user(const void *src,
 					   int len, __wsum sum,
 					   int *err_ptr)
 {
-	if (access_ok(VERIFY_WRITE, dst, len))
+	if (access_ok(dst, len))
 		return csum_partial_copy_generic((__force const void *)src,
 						dst, len, sum, NULL, err_ptr);
 
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index 6d192f4908a7..3190ec89df81 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -22,7 +22,7 @@ static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 			      u32 oldval, u32 newval)
 {
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index 32eb56e00c11..deebbfab5342 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -18,7 +18,7 @@
  */
 #define __access_ok(addr, size)		\
 	(__addr_ok((addr) + (size)))
-#define access_ok(type, addr, size)	\
+#define access_ok(addr, size)	\
 	(__chk_user_ptr(addr),		\
 	 __access_ok((unsigned long __force)(addr), (size)))
 
@@ -66,7 +66,7 @@ struct __large_struct { unsigned long buf[100]; };
 	long __gu_err = -EFAULT;					\
 	unsigned long __gu_val = 0;					\
 	const __typeof__(*(ptr)) *__gu_addr = (ptr);			\
-	if (likely(access_ok(VERIFY_READ, __gu_addr, (size))))		\
+	if (likely(access_ok(__gu_addr, (size))))		\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
 	__gu_err;							\
@@ -87,7 +87,7 @@ struct __large_struct { unsigned long buf[100]; };
 	long __pu_err = -EFAULT;				\
 	__typeof__(*(ptr)) __user *__pu_addr = (ptr);		\
 	__typeof__(*(ptr)) __pu_val = x;			\
-	if (likely(access_ok(VERIFY_WRITE, __pu_addr, size)))	\
+	if (likely(access_ok(__pu_addr, size)))	\
 		__put_user_size(__pu_val, __pu_addr, (size),	\
 				__pu_err);			\
 	__pu_err;						\
@@ -132,8 +132,7 @@ __kernel_size_t __clear_user(void *addr, __kernel_size_t size);
 	void __user * __cl_addr = (addr);				\
 	unsigned long __cl_size = (n);					\
 									\
-	if (__cl_size && access_ok(VERIFY_WRITE,			\
-		((unsigned long)(__cl_addr)), __cl_size))		\
+	if (__cl_size && access_ok(__cl_addr, __cl_size))		\
 		__cl_size = __clear_user(__cl_addr, __cl_size);		\
 									\
 	__cl_size;							\
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index c46c0020ff55..2a2121ba8ebe 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -160,7 +160,7 @@ asmlinkage int sys_sigreturn(void)
         /* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -190,7 +190,7 @@ asmlinkage int sys_rt_sigreturn(void)
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -272,7 +272,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(&ksig->ka, regs->regs[15], sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= setup_sigcontext(&frame->sc, regs, set->sig[0]);
@@ -338,7 +338,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(&ksig->ka, regs->regs[15], sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 76661dee3c65..f1f1598879c2 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -259,7 +259,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -293,7 +293,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -379,7 +379,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs
 
 	frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= setup_sigcontext(&frame->sc, regs, set->sig[0]);
@@ -465,7 +465,7 @@ static int setup_rt_frame(struct ksignal *kig, sigset_t *set,
 
 	frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame));
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	err |= __put_user(&frame->info, &frame->pinfo);
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index c52bda4d2574..8ce90a7da67d 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -40,7 +40,7 @@ static int read_opcode(reg_size_t pc, insn_size_t *result_opcode, int from_user_
 		/* SHmedia */
 		aligned_pc = pc & ~3;
 		if (from_user_mode) {
-			if (!access_ok(VERIFY_READ, aligned_pc, sizeof(insn_size_t))) {
+			if (!access_ok(aligned_pc, sizeof(insn_size_t))) {
 				get_user_error = -EFAULT;
 			} else {
 				get_user_error = __get_user(opcode, (insn_size_t *)aligned_pc);
@@ -180,7 +180,7 @@ static int misaligned_load(struct pt_regs *regs,
 	if (user_mode(regs)) {
 		__u64 buffer;
 
-		if (!access_ok(VERIFY_READ, (unsigned long) address, 1UL<<width_shift)) {
+		if (!access_ok((unsigned long) address, 1UL<<width_shift)) {
 			return -1;
 		}
 
@@ -254,7 +254,7 @@ static int misaligned_store(struct pt_regs *regs,
 	if (user_mode(regs)) {
 		__u64 buffer;
 
-		if (!access_ok(VERIFY_WRITE, (unsigned long) address, 1UL<<width_shift)) {
+		if (!access_ok((unsigned long) address, 1UL<<width_shift)) {
 			return -1;
 		}
 
@@ -327,7 +327,7 @@ static int misaligned_fpu_load(struct pt_regs *regs,
 		__u64 buffer;
 		__u32 buflo, bufhi;
 
-		if (!access_ok(VERIFY_READ, (unsigned long) address, 1UL<<width_shift)) {
+		if (!access_ok((unsigned long) address, 1UL<<width_shift)) {
 			return -1;
 		}
 
@@ -400,7 +400,7 @@ static int misaligned_fpu_store(struct pt_regs *regs,
 		/* Initialise these to NaNs. */
 		__u32 buflo=0xffffffffUL, bufhi=0xffffffffUL;
 
-		if (!access_ok(VERIFY_WRITE, (unsigned long) address, 1UL<<width_shift)) {
+		if (!access_ok((unsigned long) address, 1UL<<width_shift)) {
 			return -1;
 		}
 
@@ -663,7 +663,7 @@ void do_reserved_inst(unsigned long error_code, struct pt_regs *regs)
 	/* SHmedia : check for defect.  This requires executable vmas
 	   to be readable too. */
 	aligned_pc = pc & ~3;
-	if (!access_ok(VERIFY_READ, aligned_pc, sizeof(insn_size_t)))
+	if (!access_ok(aligned_pc, sizeof(insn_size_t)))
 		get_user_error = -EFAULT;
 	else
 		get_user_error = __get_user(opcode, (insn_size_t *)aligned_pc);
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 56c86ca98ecf..3e27f6d1f1ec 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -177,8 +177,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					(void __user *)start, len)))
+	if (unlikely(!access_ok((void __user *)start, len)))
 		return 0;
 
 	/*
diff --git a/arch/sh/oprofile/backtrace.c b/arch/sh/oprofile/backtrace.c
index c7695f99c8c3..8279a7e91043 100644
--- a/arch/sh/oprofile/backtrace.c
+++ b/arch/sh/oprofile/backtrace.c
@@ -51,7 +51,7 @@ user_backtrace(unsigned long *stackaddr, struct pt_regs *regs)
 	unsigned long buf_stack;
 
 	/* Also check accessibility of address */
-	if (!access_ok(VERIFY_READ, stackaddr, sizeof(unsigned long)))
+	if (!access_ok(stackaddr, sizeof(unsigned long)))
 		return NULL;
 
 	if (__copy_from_user_inatomic(&buf_stack, stackaddr, sizeof(unsigned long)))
diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h
index d1e53d7aed39..5fc98d80b03b 100644
--- a/arch/sparc/include/asm/checksum_32.h
+++ b/arch/sparc/include/asm/checksum_32.h
@@ -87,7 +87,7 @@ static inline __wsum
 csum_partial_copy_to_user(const void *src, void __user *dst, int len,
 			  __wsum sum, int *err)
 {
-	if (!access_ok (VERIFY_WRITE, dst, len)) {
+	if (!access_ok(dst, len)) {
 		*err = -EFAULT;
 		return sum;
 	} else {
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index de71c65b99f0..69afb856e181 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -39,7 +39,7 @@
 #define __user_ok(addr, size) ({ (void)(size); (addr) < STACK_TOP; })
 #define __kernel_ok (uaccess_kernel())
 #define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size)))
-#define access_ok(type, addr, size) \
+#define access_ok(addr, size) \
 	({ (void)(type); __access_ok((unsigned long)(addr), size); })
 
 /*
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index cbb308cee394..87ae9ffb1521 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -68,7 +68,7 @@ static inline int __access_ok(const void __user * addr, unsigned long size)
 	return 1;
 }
 
-static inline int access_ok(int type, const void __user * addr, unsigned long size)
+static inline int access_ok(const void __user * addr, unsigned long size)
 {
 	return 1;
 }
diff --git a/arch/sparc/kernel/sigutil_32.c b/arch/sparc/kernel/sigutil_32.c
index 1e9fae56a853..f25c6daa9f52 100644
--- a/arch/sparc/kernel/sigutil_32.c
+++ b/arch/sparc/kernel/sigutil_32.c
@@ -65,7 +65,7 @@ int restore_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu)
 	set_used_math();
 	clear_tsk_thread_flag(current, TIF_USEDFPU);
 
-	if (!access_ok(VERIFY_READ, fpu, sizeof(*fpu)))
+	if (!access_ok(fpu, sizeof(*fpu)))
 		return -EFAULT;
 
 	err = __copy_from_user(&current->thread.float_regs[0], &fpu->si_float_regs[0],
diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c
index 64ac8c0c1429..83db94c0b431 100644
--- a/arch/sparc/kernel/unaligned_32.c
+++ b/arch/sparc/kernel/unaligned_32.c
@@ -278,7 +278,6 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn,
 			      enum direction dir)
 {
 	unsigned int reg;
-	int check = (dir == load) ? VERIFY_READ : VERIFY_WRITE;
 	int size = ((insn >> 19) & 3) == 3 ? 8 : 4;
 
 	if ((regs->pc | regs->npc) & 3)
@@ -290,18 +289,18 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn,
 
 	reg = (insn >> 25) & 0x1f;
 	if (reg >= 16) {
-		if (!access_ok(check, WINREG_ADDR(reg - 16), size))
+		if (!access_ok(WINREG_ADDR(reg - 16), size))
 			return -EFAULT;
 	}
 	reg = (insn >> 14) & 0x1f;
 	if (reg >= 16) {
-		if (!access_ok(check, WINREG_ADDR(reg - 16), size))
+		if (!access_ok(WINREG_ADDR(reg - 16), size))
 			return -EFAULT;
 	}
 	if (!(insn & 0x2000)) {
 		reg = (insn & 0x1f);
 		if (reg >= 16) {
-			if (!access_ok(check, WINREG_ADDR(reg - 16), size))
+			if (!access_ok(WINREG_ADDR(reg - 16), size))
 				return -EFAULT;
 		}
 	}
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 1a1d88a4d940..5f47422401e1 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -66,7 +66,7 @@ long arch_ptrace(struct task_struct *child, long request,
 
 #ifdef PTRACE_GETREGS
 	case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-		if (!access_ok(VERIFY_WRITE, p, MAX_REG_OFFSET)) {
+		if (!access_ok(p, MAX_REG_OFFSET)) {
 			ret = -EIO;
 			break;
 		}
@@ -81,7 +81,7 @@ long arch_ptrace(struct task_struct *child, long request,
 #ifdef PTRACE_SETREGS
 	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
 		unsigned long tmp = 0;
-		if (!access_ok(VERIFY_READ, p, MAX_REG_OFFSET)) {
+		if (!access_ok(p, MAX_REG_OFFSET)) {
 			ret = -EIO;
 			break;
 		}
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 4ae51cf15ade..63be04809d40 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -117,7 +117,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
 
 	frame = (struct rt_sigframe __user *)regs->UCreg_sp;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (restore_sigframe(regs, &frame->sig))
@@ -205,7 +205,7 @@ static inline void __user *get_sigframe(struct k_sigaction *ka,
 	/*
 	 * Check that we can actually write to the signal frame.
 	 */
-	if (!access_ok(VERIFY_WRITE, frame, framesize))
+	if (!access_ok(frame, framesize))
 		frame = NULL;
 
 	return frame;
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index d78bcc03e60e..d9d81ad7a400 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -99,7 +99,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 	 * sig_on_uaccess_err, this could go away.
 	 */
 
-	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+	if (!access_ok((void __user *)ptr, size)) {
 		struct thread_struct *thread = &current->thread;
 
 		thread->error_code	= X86_PF_USER | X86_PF_WRITE;
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 8e02b30cf08e..f65b78d32f5e 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -176,10 +176,10 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 	/* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
+	if (!access_ok((void *) (unsigned long)START_DATA(dump),
 		       dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
+	if (!access_ok((void *) (unsigned long)START_STACK(dump),
 		       dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 86b1341cba9a..321fe5f5d0e9 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -119,7 +119,7 @@ asmlinkage long sys32_sigreturn(void)
 	struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
 	sigset_t set;
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask)
 	    || (_COMPAT_NSIG_WORDS > 1
@@ -147,7 +147,7 @@ asmlinkage long sys32_rt_sigreturn(void)
 
 	frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -269,7 +269,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (__put_user(sig, &frame->sig))
@@ -349,7 +349,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
 
 	frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	put_user_try {
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 11ef7b7c9cc8..a43212036257 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -75,7 +75,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 	typeof(ubuf->st_gid) gid = 0;
 	SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
 	SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
-	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
+	if (!access_ok(ubuf, sizeof(struct stat64)) ||
 	    __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
 	    __put_user(stat->ino, &ubuf->__st_ino) ||
 	    __put_user(stat->ino, &ubuf->st_ino) ||
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 7a659c74cd03..f57b94e02c57 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -182,7 +182,7 @@ static inline __wsum csum_and_copy_to_user(const void *src,
 	__wsum ret;
 
 	might_sleep();
-	if (access_ok(VERIFY_WRITE, dst, len)) {
+	if (access_ok(dst, len)) {
 		stac();
 		ret = csum_partial_copy_generic(src, (__force void *)dst,
 						len, sum, NULL, err_ptr);
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index b3ec519e3982..4fe9e7fc74d3 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -37,7 +37,7 @@ void sync_initial_page_table(void);
 /*
  * Define this if things work differently on an i386 and an i486:
  * it will (on an i486) warn about kernel memory accesses that are
- * done without a 'access_ok(VERIFY_WRITE,..)'
+ * done without a 'access_ok( ..)'
  */
 #undef TEST_ACCESS_OK
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b5e58cc0c5e7..3920f456db79 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -77,9 +77,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 
 /**
  * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE.  Note that
- *        %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- *        to write to a block, it is always safe to read from it.
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
@@ -95,7 +92,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type, addr, size)					\
+#define access_ok(addr, size)					\
 ({									\
 	WARN_ON_IN_IRQ();						\
 	likely(!__range_not_ok(addr, size, user_addr_max()));		\
@@ -670,7 +667,7 @@ extern void __cmpxchg_wrong_size(void)
 
 #define user_atomic_cmpxchg_inatomic(uval, ptr, old, new)		\
 ({									\
-	access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ?		\
+	access_ok((ptr), sizeof(*(ptr))) ?		\
 		__user_atomic_cmpxchg_inatomic((uval), (ptr),		\
 				(old), (new), sizeof(*(ptr))) :		\
 		-EFAULT;						\
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d99a8ee9e185..f6a1d299627c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -164,7 +164,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
 
-	if (!access_ok(VERIFY_WRITE, buf, size))
+	if (!access_ok(buf, size))
 		return -EACCES;
 
 	if (!static_cpu_has(X86_FEATURE_FPU))
@@ -281,7 +281,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		return 0;
 	}
 
-	if (!access_ok(VERIFY_READ, buf, size))
+	if (!access_ok(buf, size))
 		return -EACCES;
 
 	fpu__initialize(fpu);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 92a3b312a53c..08dfd4c1a4f9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -322,7 +322,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
 
 	frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (__put_user(sig, &frame->sig))
@@ -385,7 +385,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
 	frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	put_user_try {
@@ -465,7 +465,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
 	frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
@@ -547,7 +547,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
 
 	frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
@@ -610,7 +610,7 @@ SYSCALL_DEFINE0(sigreturn)
 
 	frame = (struct sigframe __user *)(regs->sp - 8);
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
 		&& __copy_from_user(&set.sig[1], &frame->extramask,
@@ -642,7 +642,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	unsigned long uc_flags;
 
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
@@ -871,7 +871,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
 
 	frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 7627455047c2..5c2d71a1dc06 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -177,7 +177,7 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
 {
 	int ret;
 
-	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+	if (!access_ok(fp, sizeof(*frame)))
 		return 0;
 
 	ret = 1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index c2fd39752da8..a092b6b40c6b 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -114,7 +114,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
 	user = vm86->user_vm86;
 
-	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+	if (!access_ok(user, vm86->vm86plus.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
 		       sizeof(struct vm86_struct))) {
 		pr_alert("could not access userspace vm86 info\n");
@@ -278,7 +278,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 	if (vm86->saved_sp0)
 		return -EPERM;
 
-	if (!access_ok(VERIFY_READ, user_vm86, plus ?
+	if (!access_ok(user_vm86, plus ?
 		       sizeof(struct vm86_struct) :
 		       sizeof(struct vm86plus_struct)))
 		return -EFAULT;
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 8bd53589ecfb..a6a2b7dccbff 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -27,7 +27,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
 	might_sleep();
 	*errp = 0;
 
-	if (!likely(access_ok(VERIFY_READ, src, len)))
+	if (!likely(access_ok(src, len)))
 		goto out_err;
 
 	/*
@@ -89,7 +89,7 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
 
 	might_sleep();
 
-	if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+	if (unlikely(!access_ok(dst, len))) {
 		*errp = -EFAULT;
 		return 0;
 	}
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 71fb58d44d58..bfd94e7812fc 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -67,7 +67,7 @@ unsigned long
 clear_user(void __user *to, unsigned long n)
 {
 	might_fault();
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		__do_clear_user(to, n);
 	return n;
 }
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 1bd837cdc4b1..ee42bb0cbeb3 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(__clear_user);
 
 unsigned long clear_user(void __user *to, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(to, n))
 		return __clear_user(to, n);
 	return n;
 }
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index c8b1b31ed7c4..f98a0c956764 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -104,7 +104,7 @@ static inline bool seg_writable(struct desc_struct *d)
 #define instruction_address	(*(struct address *)&I387->soft.fip)
 #define operand_address		(*(struct address *)&I387->soft.foo)
 
-#define FPU_access_ok(x,y,z)	if ( !access_ok(x,y,z) ) \
+#define FPU_access_ok(y,z)	if ( !access_ok(y,z) ) \
 				math_abort(FPU_info,SIGSEGV)
 #define FPU_abort		math_abort(FPU_info, SIGSEGV)
 
@@ -119,7 +119,7 @@ static inline bool seg_writable(struct desc_struct *d)
 /* A simpler test than access_ok() can probably be done for
    FPU_code_access_ok() because the only possible error is to step
    past the upper boundary of a legal code area. */
-#define	FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z)
+#define	FPU_code_access_ok(z) FPU_access_ok((void __user *)FPU_EIP,z)
 #endif
 
 #define FPU_get_user(x,y)       get_user((x),(y))
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index f821a9cd7753..f15263e158e8 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -251,7 +251,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		break;
 	case 024:		/* fldcw */
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_READ, data_address, 2);
+		FPU_access_ok(data_address, 2);
 		FPU_get_user(control_word,
 			     (unsigned short __user *)data_address);
 		RE_ENTRANT_CHECK_ON;
@@ -291,7 +291,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		break;
 	case 034:		/* fstcw m16int */
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, data_address, 2);
+		FPU_access_ok(data_address, 2);
 		FPU_put_user(control_word,
 			     (unsigned short __user *)data_address);
 		RE_ENTRANT_CHECK_ON;
@@ -305,7 +305,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
 		break;
 	case 036:		/* fstsw m2byte */
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, data_address, 2);
+		FPU_access_ok(data_address, 2);
 		FPU_put_user(status_word(),
 			     (unsigned short __user *)data_address);
 		RE_ENTRANT_CHECK_ON;
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index d40ff45497b9..f3779743d15e 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -84,7 +84,7 @@ int FPU_load_extended(long double __user *s, int stnr)
 	FPU_REG *sti_ptr = &st(stnr);
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, s, 10);
+	FPU_access_ok(s, 10);
 	__copy_from_user(sti_ptr, s, 10);
 	RE_ENTRANT_CHECK_ON;
 
@@ -98,7 +98,7 @@ int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
 	unsigned m64, l64;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, dfloat, 8);
+	FPU_access_ok(dfloat, 8);
 	FPU_get_user(m64, 1 + (unsigned long __user *)dfloat);
 	FPU_get_user(l64, (unsigned long __user *)dfloat);
 	RE_ENTRANT_CHECK_ON;
@@ -159,7 +159,7 @@ int FPU_load_single(float __user *single, FPU_REG *loaded_data)
 	int exp, tag, negative;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, single, 4);
+	FPU_access_ok(single, 4);
 	FPU_get_user(m32, (unsigned long __user *)single);
 	RE_ENTRANT_CHECK_ON;
 
@@ -214,7 +214,7 @@ int FPU_load_int64(long long __user *_s)
 	FPU_REG *st0_ptr = &st(0);
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, _s, 8);
+	FPU_access_ok(_s, 8);
 	if (copy_from_user(&s, _s, 8))
 		FPU_abort;
 	RE_ENTRANT_CHECK_ON;
@@ -243,7 +243,7 @@ int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
 	int negative;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, _s, 4);
+	FPU_access_ok(_s, 4);
 	FPU_get_user(s, _s);
 	RE_ENTRANT_CHECK_ON;
 
@@ -271,7 +271,7 @@ int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
 	int s, negative;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, _s, 2);
+	FPU_access_ok(_s, 2);
 	/* Cast as short to get the sign extended. */
 	FPU_get_user(s, _s);
 	RE_ENTRANT_CHECK_ON;
@@ -304,7 +304,7 @@ int FPU_load_bcd(u_char __user *s)
 	int sign;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, s, 10);
+	FPU_access_ok(s, 10);
 	RE_ENTRANT_CHECK_ON;
 	for (pos = 8; pos >= 0; pos--) {
 		l *= 10;
@@ -345,7 +345,7 @@ int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
 
 	if (st0_tag != TAG_Empty) {
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, d, 10);
+		FPU_access_ok(d, 10);
 
 		FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d);
 		FPU_put_user(st0_ptr->sigh,
@@ -364,7 +364,7 @@ int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
 		/* The masked response */
 		/* Put out the QNaN indefinite */
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, d, 10);
+		FPU_access_ok(d, 10);
 		FPU_put_user(0, (unsigned long __user *)d);
 		FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d);
 		FPU_put_user(0xffff, 4 + (short __user *)d);
@@ -539,7 +539,7 @@ denormal_arg:
 			/* The masked response */
 			/* Put out the QNaN indefinite */
 			RE_ENTRANT_CHECK_OFF;
-			FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+			FPU_access_ok(dfloat, 8);
 			FPU_put_user(0, (unsigned long __user *)dfloat);
 			FPU_put_user(0xfff80000,
 				     1 + (unsigned long __user *)dfloat);
@@ -552,7 +552,7 @@ denormal_arg:
 		l[1] |= 0x80000000;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+	FPU_access_ok(dfloat, 8);
 	FPU_put_user(l[0], (unsigned long __user *)dfloat);
 	FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
 	RE_ENTRANT_CHECK_ON;
@@ -724,7 +724,7 @@ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
 			/* The masked response */
 			/* Put out the QNaN indefinite */
 			RE_ENTRANT_CHECK_OFF;
-			FPU_access_ok(VERIFY_WRITE, single, 4);
+			FPU_access_ok(single, 4);
 			FPU_put_user(0xffc00000,
 				     (unsigned long __user *)single);
 			RE_ENTRANT_CHECK_ON;
@@ -742,7 +742,7 @@ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
 		templ |= 0x80000000;
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, single, 4);
+	FPU_access_ok(single, 4);
 	FPU_put_user(templ, (unsigned long __user *)single);
 	RE_ENTRANT_CHECK_ON;
 
@@ -791,7 +791,7 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
 	}
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, d, 8);
+	FPU_access_ok(d, 8);
 	if (copy_to_user(d, &tll, 8))
 		FPU_abort;
 	RE_ENTRANT_CHECK_ON;
@@ -838,7 +838,7 @@ int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
 	}
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, d, 4);
+	FPU_access_ok(d, 4);
 	FPU_put_user(t.sigl, (unsigned long __user *)d);
 	RE_ENTRANT_CHECK_ON;
 
@@ -884,7 +884,7 @@ int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
 	}
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, d, 2);
+	FPU_access_ok(d, 2);
 	FPU_put_user((short)t.sigl, d);
 	RE_ENTRANT_CHECK_ON;
 
@@ -925,7 +925,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
 		if (control_word & CW_Invalid) {
 			/* Produce the QNaN "indefinite" */
 			RE_ENTRANT_CHECK_OFF;
-			FPU_access_ok(VERIFY_WRITE, d, 10);
+			FPU_access_ok(d, 10);
 			for (i = 0; i < 7; i++)
 				FPU_put_user(0, d + i);	/* These bytes "undefined" */
 			FPU_put_user(0xc0, d + 7);	/* This byte "undefined" */
@@ -941,7 +941,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
 	}
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, d, 10);
+	FPU_access_ok(d, 10);
 	RE_ENTRANT_CHECK_ON;
 	for (i = 0; i < 9; i++) {
 		b = FPU_div_small(&ll, 10);
@@ -1034,7 +1034,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
 	    ((addr_modes.default_mode == PM16)
 	     ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_READ, s, 0x0e);
+		FPU_access_ok(s, 0x0e);
 		FPU_get_user(control_word, (unsigned short __user *)s);
 		FPU_get_user(partial_status, (unsigned short __user *)(s + 2));
 		FPU_get_user(tag_word, (unsigned short __user *)(s + 4));
@@ -1056,7 +1056,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
 		}
 	} else {
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_READ, s, 0x1c);
+		FPU_access_ok(s, 0x1c);
 		FPU_get_user(control_word, (unsigned short __user *)s);
 		FPU_get_user(partial_status, (unsigned short __user *)(s + 4));
 		FPU_get_user(tag_word, (unsigned short __user *)(s + 8));
@@ -1125,7 +1125,7 @@ void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
 
 	/* Copy all registers in stack order. */
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_READ, s, 80);
+	FPU_access_ok(s, 80);
 	__copy_from_user(register_base + offset, s, other);
 	if (offset)
 		__copy_from_user(register_base, s + other, offset);
@@ -1146,7 +1146,7 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
 	    ((addr_modes.default_mode == PM16)
 	     ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, d, 14);
+		FPU_access_ok(d, 14);
 #ifdef PECULIAR_486
 		FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d);
 #else
@@ -1174,7 +1174,7 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
 		d += 0x0e;
 	} else {
 		RE_ENTRANT_CHECK_OFF;
-		FPU_access_ok(VERIFY_WRITE, d, 7 * 4);
+		FPU_access_ok(d, 7 * 4);
 #ifdef PECULIAR_486
 		control_word &= ~0xe080;
 		/* An 80486 sets nearly all of the reserved bits to 1. */
@@ -1204,7 +1204,7 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
 	d = fstenv(addr_modes, data_address);
 
 	RE_ENTRANT_CHECK_OFF;
-	FPU_access_ok(VERIFY_WRITE, d, 80);
+	FPU_access_ok(d, 80);
 
 	/* Copy all registers in stack order. */
 	if (__copy_to_user(d, register_base + offset, other))
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 2385538e8065..de1851d15699 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -495,7 +495,7 @@ static int get_bt_addr(struct mm_struct *mm,
 	unsigned long bd_entry;
 	unsigned long bt_addr;
 
-	if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
+	if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr)))
 		return -EFAULT;
 
 	while (1) {
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
index 83a75f8a1233..b9ac7c9eb72c 100644
--- a/arch/x86/um/asm/checksum_32.h
+++ b/arch/x86/um/asm/checksum_32.h
@@ -43,7 +43,7 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src,
 						     void __user *dst,
 						     int len, __wsum sum, int *err_ptr)
 {
-	if (access_ok(VERIFY_WRITE, dst, len)) {
+	if (access_ok(dst, len)) {
 		if (copy_to_user(dst, src, len)) {
 			*err_ptr = -EFAULT;
 			return (__force __wsum)-1;
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 727ed442e0a5..8b4a71efe7ee 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -367,7 +367,7 @@ int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
 	/* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
 	stack_top = ((stack_top + 4) & -16UL) - 4;
 	frame = (struct sigframe __user *) stack_top - 1;
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return 1;
 
 	restorer = frame->retcode;
@@ -412,7 +412,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
 
 	stack_top &= -8UL;
 	frame = (struct rt_sigframe __user *) stack_top - 1;
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		return 1;
 
 	restorer = frame->retcode;
@@ -497,7 +497,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
 	/* Subtract 128 for a red zone and 8 for proper alignment */
 	frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8);
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto out;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h
index 3ae74d7e074b..f302ef57973a 100644
--- a/arch/xtensa/include/asm/checksum.h
+++ b/arch/xtensa/include/asm/checksum.h
@@ -243,7 +243,7 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src,
 					       void __user *dst, int len,
 					       __wsum sum, int *err_ptr)
 {
-	if (access_ok(VERIFY_WRITE, dst, len))
+	if (access_ok(dst, len))
 		return csum_partial_copy_generic(src,dst,len,sum,NULL,err_ptr);
 
 	if (len)
diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h
index fd0eef6b8e7c..505d09eff184 100644
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -93,7 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 {
 	int ret = 0;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 #if !XCHAL_HAVE_S32C1I
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index d11ef2939652..4b2480304bc3 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -42,7 +42,7 @@
 #define __user_ok(addr, size) \
 		(((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size)))
 #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size)))
-#define access_ok(type, addr, size) __access_ok((unsigned long)(addr), (size))
+#define access_ok(addr, size) __access_ok((unsigned long)(addr), (size))
 
 #define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
@@ -86,7 +86,7 @@ extern long __put_user_bad(void);
 ({									\
 	long __pu_err = -EFAULT;					\
 	__typeof__(*(ptr)) *__pu_addr = (ptr);				\
-	if (access_ok(VERIFY_WRITE, __pu_addr, size))			\
+	if (access_ok(__pu_addr, size))			\
 		__put_user_size((x), __pu_addr, (size), __pu_err);	\
 	__pu_err;							\
 })
@@ -183,7 +183,7 @@ __asm__ __volatile__(					\
 ({									\
 	long __gu_err = -EFAULT, __gu_val = 0;				\
 	const __typeof__(*(ptr)) *__gu_addr = (ptr);			\
-	if (access_ok(VERIFY_READ, __gu_addr, size))			\
+	if (access_ok(__gu_addr, size))			\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
 	__gu_err;							\
@@ -269,7 +269,7 @@ __xtensa_clear_user(void *addr, unsigned long size)
 static inline unsigned long
 clear_user(void *addr, unsigned long size)
 {
-	if (access_ok(VERIFY_WRITE, addr, size))
+	if (access_ok(addr, size))
 		return __xtensa_clear_user(addr, size);
 	return size ? -EFAULT : 0;
 }
@@ -284,7 +284,7 @@ extern long __strncpy_user(char *, const char *, long);
 static inline long
 strncpy_from_user(char *dst, const char *src, long count)
 {
-	if (access_ok(VERIFY_READ, src, 1))
+	if (access_ok(src, 1))
 		return __strncpy_user(dst, src, count);
 	return -EFAULT;
 }
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 74e1682876ac..dc22a238ed9c 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -251,7 +251,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
 
 	frame = (struct rt_sigframe __user *) regs->areg[1];
 
-	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -348,7 +348,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set,
 	if (regs->depc > 64)
 		panic ("Double exception sys_sigreturn\n");
 
-	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) {
+	if (!access_ok(frame, sizeof(*frame))) {
 		return -EFAULT;
 	}
 
diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c
index 0df4080fa20f..174c11f13bba 100644
--- a/arch/xtensa/kernel/stacktrace.c
+++ b/arch/xtensa/kernel/stacktrace.c
@@ -91,7 +91,7 @@ void xtensa_backtrace_user(struct pt_regs *regs, unsigned int depth,
 		pc = MAKE_PC_FROM_RA(a0, pc);
 
 		/* Check if the region is OK to access. */
-		if (!access_ok(VERIFY_READ, &SPILL_SLOT(a1, 0), 8))
+		if (!access_ok(&SPILL_SLOT(a1, 0), 8))
 			return;
 		/* Copy a1, a0 from user space stack frame. */
 		if (__get_user(a0, &SPILL_SLOT(a1, 0)) ||
diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c
index f21c99ec46ee..a2dcd62ea32f 100644
--- a/drivers/acpi/acpi_dbg.c
+++ b/drivers/acpi/acpi_dbg.c
@@ -614,7 +614,7 @@ static ssize_t acpi_aml_read(struct file *file, char __user *buf,
 
 	if (!count)
 		return 0;
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	while (count > 0) {
@@ -684,7 +684,7 @@ static ssize_t acpi_aml_write(struct file *file, const char __user *buf,
 
 	if (!count)
 		return 0;
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	while (count > 0) {
diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c
index 14e728fbb8a0..ff5394f47587 100644
--- a/drivers/char/generic_nvram.c
+++ b/drivers/char/generic_nvram.c
@@ -44,7 +44,7 @@ static ssize_t read_nvram(struct file *file, char __user *buf,
 	unsigned int i;
 	char __user *p = buf;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (*ppos >= nvram_len)
 		return 0;
@@ -62,7 +62,7 @@ static ssize_t write_nvram(struct file *file, const char __user *buf,
 	const char __user *p = buf;
 	char c;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (*ppos >= nvram_len)
 		return 0;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 7b4e4de778e4..b08dc50f9f26 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -609,7 +609,7 @@ static ssize_t read_port(struct file *file, char __user *buf,
 	unsigned long i = *ppos;
 	char __user *tmp = buf;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	while (count-- > 0 && i < 65536) {
 		if (__put_user(inb(i), tmp) < 0)
@@ -627,7 +627,7 @@ static ssize_t write_port(struct file *file, const char __user *buf,
 	unsigned long i = *ppos;
 	const char __user *tmp = buf;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	while (count-- > 0 && i < 65536) {
 		char c;
diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c
index a284ae25e69a..76fb434068d4 100644
--- a/drivers/char/nwflash.c
+++ b/drivers/char/nwflash.c
@@ -167,7 +167,7 @@ static ssize_t flash_write(struct file *file, const char __user *buf,
 	if (count > gbFlashSize - p)
 		count = gbFlashSize - p;
 			
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/*
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 809507bf8f1c..7a4eb86aedac 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1445,11 +1445,11 @@ static long cmm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	      _IOC_DIR(cmd), _IOC_READ, _IOC_WRITE, size, cmd);
 
 	if (_IOC_DIR(cmd) & _IOC_READ) {
-		if (!access_ok(VERIFY_WRITE, argp, size))
+		if (!access_ok(argp, size))
 			goto out;
 	}
 	if (_IOC_DIR(cmd) & _IOC_WRITE) {
-		if (!access_ok(VERIFY_READ, argp, size))
+		if (!access_ok(argp, size))
 			goto out;
 	}
 	rc = 0;
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index d64a78ccc03e..b16be8a11d92 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -364,7 +364,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp)
 		goto cmd;
 
 	/* allocate a physically contiguous buffer to store the CSR blob */
-	if (!access_ok(VERIFY_WRITE, input.address, input.length) ||
+	if (!access_ok(input.address, input.length) ||
 	    input.length > SEV_FW_BLOB_MAX_SIZE) {
 		ret = -EFAULT;
 		goto e_free;
@@ -644,14 +644,14 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp)
 
 	/* Allocate a physically contiguous buffer to store the PDH blob. */
 	if ((input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) ||
-	    !access_ok(VERIFY_WRITE, input.pdh_cert_address, input.pdh_cert_len)) {
+	    !access_ok(input.pdh_cert_address, input.pdh_cert_len)) {
 		ret = -EFAULT;
 		goto e_free;
 	}
 
 	/* Allocate a physically contiguous buffer to store the cert chain blob. */
 	if ((input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) ||
-	    !access_ok(VERIFY_WRITE, input.cert_chain_address, input.cert_chain_len)) {
+	    !access_ok(input.cert_chain_address, input.cert_chain_len)) {
 		ret = -EFAULT;
 		goto e_free;
 	}
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index d8e185582642..16a7045736a9 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1094,7 +1094,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
 		return -EINVAL;
 
 	p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets);
-	if (!access_ok(VERIFY_READ, p, a->size))
+	if (!access_ok(p, a->size))
 		return -EFAULT;
 
 	end = (void __user *)p + a->size;
diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c
index 769640940c9f..51ecf7d6da48 100644
--- a/drivers/firmware/efi/test/efi_test.c
+++ b/drivers/firmware/efi/test/efi_test.c
@@ -68,7 +68,7 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src,
 		return 0;
 	}
 
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return -EFAULT;
 
 	buf = memdup_user(src, len);
@@ -89,7 +89,7 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src,
 static inline int
 get_ucs2_strsize_from_user(efi_char16_t __user *src, size_t *len)
 {
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return -EFAULT;
 
 	*len = user_ucs2_strsize(src);
@@ -116,7 +116,7 @@ copy_ucs2_from_user(efi_char16_t **dst, efi_char16_t __user *src)
 {
 	size_t len;
 
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return -EFAULT;
 
 	len = user_ucs2_strsize(src);
@@ -140,7 +140,7 @@ copy_ucs2_to_user_len(efi_char16_t __user *dst, efi_char16_t *src, size_t len)
 	if (!src)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE, dst, 1))
+	if (!access_ok(dst, 1))
 		return -EFAULT;
 
 	return copy_to_user(dst, src, len);
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index 025aba3ea76c..e18a786fc943 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -369,7 +369,7 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
 	if (user_addr + length < user_addr)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, (void __user *)(unsigned long)user_addr,
+	if (!access_ok((void __user *)(unsigned long)user_addr,
 		       length))
 		return -EINVAL;
 
diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
index fe5a5578fbf7..d9ca9554844a 100644
--- a/drivers/fpga/dfl-fme-pr.c
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -99,8 +99,7 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg)
 		return -EINVAL;
 	}
 
-	if (!access_ok(VERIFY_READ,
-		       (void __user *)(unsigned long)port_pr.buffer_address,
+	if (!access_ok((void __user *)(unsigned long)port_pr.buffer_address,
 		       port_pr.buffer_size))
 		return -EFAULT;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3623538baf6f..be68752c3469 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -158,8 +158,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 	}
 
 	if ((args->ring_base_address) &&
-		(!access_ok(VERIFY_WRITE,
-			(const void __user *) args->ring_base_address,
+		(!access_ok((const void __user *) args->ring_base_address,
 			sizeof(uint64_t)))) {
 		pr_err("Can't access ring base address\n");
 		return -EFAULT;
@@ -170,31 +169,27 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 		return -EINVAL;
 	}
 
-	if (!access_ok(VERIFY_WRITE,
-			(const void __user *) args->read_pointer_address,
+	if (!access_ok((const void __user *) args->read_pointer_address,
 			sizeof(uint32_t))) {
 		pr_err("Can't access read pointer\n");
 		return -EFAULT;
 	}
 
-	if (!access_ok(VERIFY_WRITE,
-			(const void __user *) args->write_pointer_address,
+	if (!access_ok((const void __user *) args->write_pointer_address,
 			sizeof(uint32_t))) {
 		pr_err("Can't access write pointer\n");
 		return -EFAULT;
 	}
 
 	if (args->eop_buffer_address &&
-		!access_ok(VERIFY_WRITE,
-			(const void __user *) args->eop_buffer_address,
+		!access_ok((const void __user *) args->eop_buffer_address,
 			sizeof(uint32_t))) {
 		pr_debug("Can't access eop buffer");
 		return -EFAULT;
 	}
 
 	if (args->ctx_save_restore_address &&
-		!access_ok(VERIFY_WRITE,
-			(const void __user *) args->ctx_save_restore_address,
+		!access_ok((const void __user *) args->ctx_save_restore_address,
 			sizeof(uint32_t))) {
 		pr_debug("Can't access ctx save restore buffer");
 		return -EFAULT;
@@ -365,8 +360,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 	}
 
 	if ((args->ring_base_address) &&
-		(!access_ok(VERIFY_WRITE,
-			(const void __user *) args->ring_base_address,
+		(!access_ok((const void __user *) args->ring_base_address,
 			sizeof(uint64_t)))) {
 		pr_err("Can't access ring base address\n");
 		return -EFAULT;
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 892c1d9304bb..642d0e70d0f8 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -334,7 +334,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 	ptr = (char __user *)(uintptr_t)args->ptr;
 
-	if (!access_ok(VERIFY_READ, ptr, args->size))
+	if (!access_ok(ptr, args->size))
 		return -EFAULT;
 
 	ret = fault_in_pages_readable(ptr, args->size);
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index ffa8dc35515f..46f48f245eb5 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -525,7 +525,7 @@ ssize_t drm_read(struct file *filp, char __user *buffer,
 	struct drm_device *dev = file_priv->minor->dev;
 	ssize_t ret;
 
-	if (!access_ok(VERIFY_WRITE, buffer, count))
+	if (!access_ok(buffer, count))
 		return -EFAULT;
 
 	ret = mutex_lock_interruptible(&file_priv->event_read_lock);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
index 96efc84396bf..18c27f795cf6 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
@@ -339,7 +339,6 @@ static int etnaviv_ioctl_gem_userptr(struct drm_device *dev, void *data,
 	struct drm_file *file)
 {
 	struct drm_etnaviv_gem_userptr *args = data;
-	int access;
 
 	if (args->flags & ~(ETNA_USERPTR_READ|ETNA_USERPTR_WRITE) ||
 	    args->flags == 0)
@@ -351,12 +350,7 @@ static int etnaviv_ioctl_gem_userptr(struct drm_device *dev, void *data,
 	    args->user_ptr & ~PAGE_MASK)
 		return -EINVAL;
 
-	if (args->flags & ETNA_USERPTR_WRITE)
-		access = VERIFY_WRITE;
-	else
-		access = VERIFY_READ;
-
-	if (!access_ok(access, (void __user *)(unsigned long)args->user_ptr,
+	if (!access_ok((void __user *)(unsigned long)args->user_ptr,
 		       args->user_size))
 		return -EFAULT;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a9de07bb72c8..216f52b744a6 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1282,8 +1282,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 	if (args->size == 0)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE,
-		       u64_to_user_ptr(args->data_ptr),
+	if (!access_ok(u64_to_user_ptr(args->data_ptr),
 		       args->size))
 		return -EFAULT;
 
@@ -1609,9 +1608,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	if (args->size == 0)
 		return 0;
 
-	if (!access_ok(VERIFY_READ,
-		       u64_to_user_ptr(args->data_ptr),
-		       args->size))
+	if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
 		return -EFAULT;
 
 	obj = i915_gem_object_lookup(file, args->handle);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8ff6b581cf1c..fee66ccebed6 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1447,7 +1447,7 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 	 * to read. However, if the array is not writable the user loses
 	 * the updated relocation values.
 	 */
-	if (unlikely(!access_ok(VERIFY_READ, urelocs, remain*sizeof(*urelocs))))
+	if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs))))
 		return -EFAULT;
 
 	do {
@@ -1554,7 +1554,7 @@ static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
 
 	addr = u64_to_user_ptr(entry->relocs_ptr);
 	size *= sizeof(struct drm_i915_gem_relocation_entry);
-	if (!access_ok(VERIFY_READ, addr, size))
+	if (!access_ok(addr, size))
 		return -EFAULT;
 
 	end = addr + size;
@@ -2090,7 +2090,7 @@ get_fence_array(struct drm_i915_gem_execbuffer2 *args,
 		return ERR_PTR(-EINVAL);
 
 	user = u64_to_user_ptr(args->cliprects_ptr);
-	if (!access_ok(VERIFY_READ, user, nfences * sizeof(*user)))
+	if (!access_ok(user, nfences * sizeof(*user)))
 		return ERR_PTR(-EFAULT);
 
 	fences = kvmalloc_array(nfences, sizeof(*fences),
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 3df77020aada..9558582c105e 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -789,8 +789,7 @@ i915_gem_userptr_ioctl(struct drm_device *dev,
 	if (offset_in_page(args->user_ptr | args->user_size))
 		return -EINVAL;
 
-	if (!access_ok(args->flags & I915_USERPTR_READ_ONLY ? VERIFY_READ : VERIFY_WRITE,
-		       (char __user *)(unsigned long)args->user_ptr, args->user_size))
+	if (!access_ok((char __user *)(unsigned long)args->user_ptr, args->user_size))
 		return -EFAULT;
 
 	if (args->flags & I915_USERPTR_READ_ONLY) {
diff --git a/drivers/gpu/drm/i915/i915_ioc32.c b/drivers/gpu/drm/i915/i915_ioc32.c
index 0e5c580d117c..e869daf9c8a9 100644
--- a/drivers/gpu/drm/i915/i915_ioc32.c
+++ b/drivers/gpu/drm/i915/i915_ioc32.c
@@ -52,7 +52,7 @@ static int compat_i915_getparam(struct file *file, unsigned int cmd,
 		return -EFAULT;
 
 	request = compat_alloc_user_space(sizeof(*request));
-	if (!access_ok(VERIFY_WRITE, request, sizeof(*request)) ||
+	if (!access_ok(request, sizeof(*request)) ||
 	    __put_user(req32.param, &request->param) ||
 	    __put_user((void __user *)(unsigned long)req32.value,
 		       &request->value))
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4529edfdcfc8..2b2eb57ca71f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -3052,7 +3052,7 @@ static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv,
 	if (!n_regs)
 		return NULL;
 
-	if (!access_ok(VERIFY_READ, regs, n_regs * sizeof(u32) * 2))
+	if (!access_ok(regs, n_regs * sizeof(u32) * 2))
 		return ERR_PTR(-EFAULT);
 
 	/* No is_valid function means we're not allowing any register to be programmed. */
diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index 6fc4b8eeab42..fe56465cdfd6 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -46,7 +46,7 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	if (topo.flags != 0)
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, u64_to_user_ptr(query_item->data_ptr),
+	if (!access_ok(u64_to_user_ptr(query_item->data_ptr),
 		       total_length))
 		return -EFAULT;
 
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index a28465d90529..12b983fc0b56 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -77,7 +77,7 @@ void msm_gem_submit_free(struct msm_gem_submit *submit)
 static inline unsigned long __must_check
 copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 {
-	if (access_ok(VERIFY_READ, from, n))
+	if (access_ok(from, n))
 		return __copy_from_user_inatomic(to, from, n);
 	return -EFAULT;
 }
diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c
index 6e828158bcb0..d410e2925162 100644
--- a/drivers/gpu/drm/qxl/qxl_ioctl.c
+++ b/drivers/gpu/drm/qxl/qxl_ioctl.c
@@ -163,8 +163,7 @@ static int qxl_process_single_command(struct qxl_device *qdev,
 	if (cmd->command_size > PAGE_SIZE - sizeof(union qxl_release_info))
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_READ,
-		       u64_to_user_ptr(cmd->command),
+	if (!access_ok(u64_to_user_ptr(cmd->command),
 		       cmd->command_size))
 		return -EFAULT;
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 9f9172eb1512..fb0007aa0c27 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -611,8 +611,7 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
 			if (hdr->out_words * 8 < method_elm->resp_size)
 				return -ENOSPC;
 
-			if (!access_ok(VERIFY_WRITE,
-				       u64_to_user_ptr(ex_hdr->response),
+			if (!access_ok(u64_to_user_ptr(ex_hdr->response),
 				       (hdr->out_words + ex_hdr->provider_out_words) * 8))
 				return -EFAULT;
 		} else {
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index dbe7d14a5c76..0cd71ce7cc71 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -232,7 +232,7 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
 	}
 
 	/* Verify that access is OK for the user buffer */
-	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+	if (!access_ok((void __user *)vaddr,
 		       npages * PAGE_SIZE)) {
 		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
 			   (void *)vaddr, npages);
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 98e1ce14fa2a..78fa634de98a 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -343,7 +343,7 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
 
 	/* virtual address of first page in transfer */
 	vaddr = ti->tidvaddr;
-	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+	if (!access_ok((void __user *) vaddr,
 		       cnt * PAGE_SIZE)) {
 		ret = -EFAULT;
 		goto done;
diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c
index ef0c2366cf59..400960cf04d5 100644
--- a/drivers/macintosh/ans-lcd.c
+++ b/drivers/macintosh/ans-lcd.c
@@ -64,7 +64,7 @@ anslcd_write( struct file * file, const char __user * buf,
 	printk(KERN_DEBUG "LCD: write\n");
 #endif
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	mutex_lock(&anslcd_mutex);
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index ac0cf37d6239..21d532a78fa4 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -2188,7 +2188,7 @@ pmu_read(struct file *file, char __user *buf,
 
 	if (count < 1 || !pp)
 		return -EINVAL;
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	spin_lock_irqsave(&pp->lock, flags);
diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c
index 3e02de02ffdd..8ec2525d8ef5 100644
--- a/drivers/media/pci/ivtv/ivtvfb.c
+++ b/drivers/media/pci/ivtv/ivtvfb.c
@@ -356,7 +356,7 @@ static int ivtvfb_prep_frame(struct ivtv *itv, int cmd, void __user *source,
 		IVTVFB_WARN("ivtvfb_prep_frame: Count not a multiple of 4 (%d)\n", count);
 
 	/* Check Source */
-	if (!access_ok(VERIFY_READ, source + dest_offset, count)) {
+	if (!access_ok(source + dest_offset, count)) {
 		IVTVFB_WARN("Invalid userspace pointer %p\n", source);
 
 		IVTVFB_DEBUG_WARN("access_ok() failed for offset 0x%08lx source %p count %d\n",
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index fe4577a46869..73dac1d8d4f6 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -158,7 +158,7 @@ static int get_v4l2_window32(struct v4l2_window __user *p64,
 	compat_caddr_t p;
 	u32 clipcount;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    copy_in_user(&p64->w, &p32->w, sizeof(p32->w)) ||
 	    assign_in_user(&p64->field, &p32->field) ||
 	    assign_in_user(&p64->chromakey, &p32->chromakey) ||
@@ -283,7 +283,7 @@ static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size)
 
 static int bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size)
 {
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)))
+	if (!access_ok(p32, sizeof(*p32)))
 		return -EFAULT;
 	return __bufsize_v4l2_format(p32, size);
 }
@@ -335,7 +335,7 @@ static int get_v4l2_format32(struct v4l2_format __user *p64,
 			     struct v4l2_format32 __user *p32,
 			     void __user *aux_buf, u32 aux_space)
 {
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)))
+	if (!access_ok(p32, sizeof(*p32)))
 		return -EFAULT;
 	return __get_v4l2_format32(p64, p32, aux_buf, aux_space);
 }
@@ -343,7 +343,7 @@ static int get_v4l2_format32(struct v4l2_format __user *p64,
 static int bufsize_v4l2_create(struct v4l2_create_buffers32 __user *p32,
 			       u32 *size)
 {
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)))
+	if (!access_ok(p32, sizeof(*p32)))
 		return -EFAULT;
 	return __bufsize_v4l2_format(&p32->format, size);
 }
@@ -352,7 +352,7 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64,
 			     struct v4l2_create_buffers32 __user *p32,
 			     void __user *aux_buf, u32 aux_space)
 {
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    copy_in_user(p64, p32,
 			 offsetof(struct v4l2_create_buffers32, format)))
 		return -EFAULT;
@@ -404,7 +404,7 @@ static int __put_v4l2_format32(struct v4l2_format __user *p64,
 static int put_v4l2_format32(struct v4l2_format __user *p64,
 			     struct v4l2_format32 __user *p32)
 {
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)))
+	if (!access_ok(p32, sizeof(*p32)))
 		return -EFAULT;
 	return __put_v4l2_format32(p64, p32);
 }
@@ -412,7 +412,7 @@ static int put_v4l2_format32(struct v4l2_format __user *p64,
 static int put_v4l2_create32(struct v4l2_create_buffers __user *p64,
 			     struct v4l2_create_buffers32 __user *p32)
 {
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    copy_in_user(p32, p64,
 			 offsetof(struct v4l2_create_buffers32, format)) ||
 	    assign_in_user(&p32->capabilities, &p64->capabilities) ||
@@ -434,7 +434,7 @@ static int get_v4l2_standard32(struct v4l2_standard __user *p64,
 			       struct v4l2_standard32 __user *p32)
 {
 	/* other fields are not set by the user, nor used by the driver */
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p64->index, &p32->index))
 		return -EFAULT;
 	return 0;
@@ -443,7 +443,7 @@ static int get_v4l2_standard32(struct v4l2_standard __user *p64,
 static int put_v4l2_standard32(struct v4l2_standard __user *p64,
 			       struct v4l2_standard32 __user *p32)
 {
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p32->index, &p64->index) ||
 	    assign_in_user(&p32->id, &p64->id) ||
 	    copy_in_user(p32->name, p64->name, sizeof(p32->name)) ||
@@ -560,7 +560,7 @@ static int bufsize_v4l2_buffer(struct v4l2_buffer32 __user *p32, u32 *size)
 	u32 type;
 	u32 length;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    get_user(type, &p32->type) ||
 	    get_user(length, &p32->length))
 		return -EFAULT;
@@ -593,7 +593,7 @@ static int get_v4l2_buffer32(struct v4l2_buffer __user *p64,
 	compat_caddr_t p;
 	int ret;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p64->index, &p32->index) ||
 	    get_user(type, &p32->type) ||
 	    put_user(type, &p64->type) ||
@@ -632,7 +632,7 @@ static int get_v4l2_buffer32(struct v4l2_buffer __user *p64,
 			return -EFAULT;
 
 		uplane32 = compat_ptr(p);
-		if (!access_ok(VERIFY_READ, uplane32,
+		if (!access_ok(uplane32,
 			       num_planes * sizeof(*uplane32)))
 			return -EFAULT;
 
@@ -691,7 +691,7 @@ static int put_v4l2_buffer32(struct v4l2_buffer __user *p64,
 	compat_caddr_t p;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p32->index, &p64->index) ||
 	    get_user(type, &p64->type) ||
 	    put_user(type, &p32->type) ||
@@ -781,7 +781,7 @@ static int get_v4l2_framebuffer32(struct v4l2_framebuffer __user *p64,
 {
 	compat_caddr_t tmp;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    get_user(tmp, &p32->base) ||
 	    put_user_force(compat_ptr(tmp), &p64->base) ||
 	    assign_in_user(&p64->capability, &p32->capability) ||
@@ -796,7 +796,7 @@ static int put_v4l2_framebuffer32(struct v4l2_framebuffer __user *p64,
 {
 	void *base;
 
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    get_user(base, &p64->base) ||
 	    put_user(ptr_to_compat((void __user *)base), &p32->base) ||
 	    assign_in_user(&p32->capability, &p64->capability) ||
@@ -893,7 +893,7 @@ static int bufsize_v4l2_ext_controls(struct v4l2_ext_controls32 __user *p32,
 {
 	u32 count;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    get_user(count, &p32->count))
 		return -EFAULT;
 	if (count > V4L2_CID_MAX_CTRLS)
@@ -913,7 +913,7 @@ static int get_v4l2_ext_controls32(struct file *file,
 	u32 n;
 	compat_caddr_t p;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p64->which, &p32->which) ||
 	    get_user(count, &p32->count) ||
 	    put_user(count, &p64->count) ||
@@ -929,7 +929,7 @@ static int get_v4l2_ext_controls32(struct file *file,
 	if (get_user(p, &p32->controls))
 		return -EFAULT;
 	ucontrols = compat_ptr(p);
-	if (!access_ok(VERIFY_READ, ucontrols, count * sizeof(*ucontrols)))
+	if (!access_ok(ucontrols, count * sizeof(*ucontrols)))
 		return -EFAULT;
 	if (aux_space < count * sizeof(*kcontrols))
 		return -EFAULT;
@@ -979,7 +979,7 @@ static int put_v4l2_ext_controls32(struct file *file,
 	 * with __user causes smatch warnings, so instead declare it
 	 * without __user and cast it as a userspace pointer where needed.
 	 */
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p32->which, &p64->which) ||
 	    get_user(count, &p64->count) ||
 	    put_user(count, &p32->count) ||
@@ -994,7 +994,7 @@ static int put_v4l2_ext_controls32(struct file *file,
 	if (get_user(p, &p32->controls))
 		return -EFAULT;
 	ucontrols = compat_ptr(p);
-	if (!access_ok(VERIFY_WRITE, ucontrols, count * sizeof(*ucontrols)))
+	if (!access_ok(ucontrols, count * sizeof(*ucontrols)))
 		return -EFAULT;
 
 	for (n = 0; n < count; n++) {
@@ -1043,7 +1043,7 @@ struct v4l2_event32 {
 static int put_v4l2_event32(struct v4l2_event __user *p64,
 			    struct v4l2_event32 __user *p32)
 {
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p32->type, &p64->type) ||
 	    copy_in_user(&p32->u, &p64->u, sizeof(p64->u)) ||
 	    assign_in_user(&p32->pending, &p64->pending) ||
@@ -1069,7 +1069,7 @@ static int get_v4l2_edid32(struct v4l2_edid __user *p64,
 {
 	compat_uptr_t tmp;
 
-	if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p64->pad, &p32->pad) ||
 	    assign_in_user(&p64->start_block, &p32->start_block) ||
 	    assign_in_user_cast(&p64->blocks, &p32->blocks) ||
@@ -1085,7 +1085,7 @@ static int put_v4l2_edid32(struct v4l2_edid __user *p64,
 {
 	void *edid;
 
-	if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) ||
+	if (!access_ok(p32, sizeof(*p32)) ||
 	    assign_in_user(&p32->pad, &p64->pad) ||
 	    assign_in_user(&p32->start_block, &p64->start_block) ||
 	    assign_in_user(&p32->blocks, &p64->blocks) ||
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 5da1f3e3f997..997f92543dd4 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -236,7 +236,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context,
 	 * about the size.
 	 */
 	BUILD_BUG_ON(sizeof(bool) != sizeof(u8));
-	if (!access_ok(VERIFY_WRITE, (void __user *)uva, sizeof(u8)))
+	if (!access_ok((void __user *)uva, sizeof(u8)))
 		return VMCI_ERROR_GENERIC;
 
 	/*
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 7ac035af39f0..6fa1627ce08d 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -52,7 +52,7 @@ static ssize_t proc_bus_pci_read(struct file *file, char __user *buf,
 		nbytes = size - pos;
 	cnt = nbytes;
 
-	if (!access_ok(VERIFY_WRITE, buf, cnt))
+	if (!access_ok(buf, cnt))
 		return -EINVAL;
 
 	pci_config_pm_runtime_get(dev);
@@ -125,7 +125,7 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf,
 		nbytes = size - pos;
 	cnt = nbytes;
 
-	if (!access_ok(VERIFY_READ, buf, cnt))
+	if (!access_ok(buf, cnt))
 		return -EINVAL;
 
 	pci_config_pm_runtime_get(dev);
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 7c639006252e..321bc673c417 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -416,8 +416,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp,
 	if (unlikely(bufflen == 0))
 		return 0;
 	/* Check the buffer range for access */
-	if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ,
-				buffer, bufflen)))
+	if (unlikely(!access_ok(buffer, bufflen)))
 		return -EFAULT;
 
 	address = (unsigned long)buffer;
diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c
index 262285e48a09..051613140812 100644
--- a/drivers/pnp/isapnp/proc.c
+++ b/drivers/pnp/isapnp/proc.c
@@ -47,7 +47,7 @@ static ssize_t isapnp_proc_bus_read(struct file *file, char __user * buf,
 		nbytes = size - pos;
 	cnt = nbytes;
 
-	if (!access_ok(VERIFY_WRITE, buf, cnt))
+	if (!access_ok(buf, cnt))
 		return -EINVAL;
 
 	isapnp_cfg_begin(dev->card->number, dev->number);
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 7c4673308f5b..e338d7a4f571 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -3600,7 +3600,7 @@ static long pmcraid_ioctl_passthrough(
 	u32 ioasc;
 	int request_size;
 	int buffer_size;
-	u8 access, direction;
+	u8 direction;
 	int rc = 0;
 
 	/* If IOA reset is in progress, wait 10 secs for reset to complete */
@@ -3649,10 +3649,8 @@ static long pmcraid_ioctl_passthrough(
 	request_size = le32_to_cpu(buffer->ioarcb.data_transfer_length);
 
 	if (buffer->ioarcb.request_flags0 & TRANSFER_DIR_WRITE) {
-		access = VERIFY_READ;
 		direction = DMA_TO_DEVICE;
 	} else {
-		access = VERIFY_WRITE;
 		direction = DMA_FROM_DEVICE;
 	}
 
diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c
index cc30fccc1a2e..840d96fe81bc 100644
--- a/drivers/scsi/scsi_ioctl.c
+++ b/drivers/scsi/scsi_ioctl.c
@@ -221,7 +221,7 @@ int scsi_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
 
 	switch (cmd) {
 	case SCSI_IOCTL_GET_IDLUN:
-		if (!access_ok(VERIFY_WRITE, arg, sizeof(struct scsi_idlun)))
+		if (!access_ok(arg, sizeof(struct scsi_idlun)))
 			return -EFAULT;
 
 		__put_user((sdev->id & 0xff)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 4e27460ec926..d3f15319b9b3 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -434,7 +434,7 @@ sg_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos)
 	SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp,
 				      "sg_read: count=%d\n", (int) count));
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (sfp->force_packid && (count >= SZ_SG_HEADER)) {
 		old_hdr = kmalloc(SZ_SG_HEADER, GFP_KERNEL);
@@ -632,7 +632,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
 	      scsi_block_when_processing_errors(sdp->device)))
 		return -ENXIO;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;	/* protects following copy_from_user()s + get_user()s */
 	if (count < SZ_SG_HEADER)
 		return -EIO;
@@ -729,7 +729,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf,
 
 	if (count < SZ_SG_IO_HDR)
 		return -EINVAL;
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT; /* protects following copy_from_user()s + get_user()s */
 
 	sfp->cmd_q = 1;	/* when sg_io_hdr seen, set command queuing on */
@@ -768,7 +768,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf,
 		sg_remove_request(sfp, srp);
 		return -EMSGSIZE;
 	}
-	if (!access_ok(VERIFY_READ, hp->cmdp, hp->cmd_len)) {
+	if (!access_ok(hp->cmdp, hp->cmd_len)) {
 		sg_remove_request(sfp, srp);
 		return -EFAULT;	/* protects following copy_from_user()s + get_user()s */
 	}
@@ -922,7 +922,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
 			return -ENODEV;
 		if (!scsi_block_when_processing_errors(sdp->device))
 			return -ENXIO;
-		if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR))
+		if (!access_ok(p, SZ_SG_IO_HDR))
 			return -EFAULT;
 		result = sg_new_write(sfp, filp, p, SZ_SG_IO_HDR,
 				 1, read_only, 1, &srp);
@@ -968,7 +968,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
 	case SG_GET_LOW_DMA:
 		return put_user((int) sdp->device->host->unchecked_isa_dma, ip);
 	case SG_GET_SCSI_ID:
-		if (!access_ok(VERIFY_WRITE, p, sizeof (sg_scsi_id_t)))
+		if (!access_ok(p, sizeof (sg_scsi_id_t)))
 			return -EFAULT;
 		else {
 			sg_scsi_id_t __user *sg_idp = p;
@@ -997,7 +997,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
 		sfp->force_packid = val ? 1 : 0;
 		return 0;
 	case SG_GET_PACK_ID:
-		if (!access_ok(VERIFY_WRITE, ip, sizeof (int)))
+		if (!access_ok(ip, sizeof (int)))
 			return -EFAULT;
 		read_lock_irqsave(&sfp->rq_list_lock, iflags);
 		list_for_each_entry(srp, &sfp->rq_list, entry) {
@@ -1078,7 +1078,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg)
 		val = (sdp->device ? 1 : 0);
 		return put_user(val, ip);
 	case SG_GET_REQUEST_TABLE:
-		if (!access_ok(VERIFY_WRITE, p, SZ_SG_REQ_INFO * SG_MAX_QUEUE))
+		if (!access_ok(p, SZ_SG_REQ_INFO * SG_MAX_QUEUE))
 			return -EFAULT;
 		else {
 			sg_req_info_t *rinfo;
diff --git a/drivers/staging/comedi/comedi_compat32.c b/drivers/staging/comedi/comedi_compat32.c
index fa9d239474ee..36a3564ba1fb 100644
--- a/drivers/staging/comedi/comedi_compat32.c
+++ b/drivers/staging/comedi/comedi_compat32.c
@@ -102,8 +102,8 @@ static int compat_chaninfo(struct file *file, unsigned long arg)
 	chaninfo = compat_alloc_user_space(sizeof(*chaninfo));
 
 	/* Copy chaninfo structure.  Ignore unused members. */
-	if (!access_ok(VERIFY_READ, chaninfo32, sizeof(*chaninfo32)) ||
-	    !access_ok(VERIFY_WRITE, chaninfo, sizeof(*chaninfo)))
+	if (!access_ok(chaninfo32, sizeof(*chaninfo32)) ||
+	    !access_ok(chaninfo, sizeof(*chaninfo)))
 		return -EFAULT;
 
 	err = 0;
@@ -136,8 +136,8 @@ static int compat_rangeinfo(struct file *file, unsigned long arg)
 	rangeinfo = compat_alloc_user_space(sizeof(*rangeinfo));
 
 	/* Copy rangeinfo structure. */
-	if (!access_ok(VERIFY_READ, rangeinfo32, sizeof(*rangeinfo32)) ||
-	    !access_ok(VERIFY_WRITE, rangeinfo, sizeof(*rangeinfo)))
+	if (!access_ok(rangeinfo32, sizeof(*rangeinfo32)) ||
+	    !access_ok(rangeinfo, sizeof(*rangeinfo)))
 		return -EFAULT;
 
 	err = 0;
@@ -163,8 +163,8 @@ static int get_compat_cmd(struct comedi_cmd __user *cmd,
 	} temp;
 
 	/* Copy cmd structure. */
-	if (!access_ok(VERIFY_READ, cmd32, sizeof(*cmd32)) ||
-	    !access_ok(VERIFY_WRITE, cmd, sizeof(*cmd)))
+	if (!access_ok(cmd32, sizeof(*cmd32)) ||
+	    !access_ok(cmd, sizeof(*cmd)))
 		return -EFAULT;
 
 	err = 0;
@@ -217,8 +217,8 @@ static int put_compat_cmd(struct comedi32_cmd_struct __user *cmd32,
 	 * Assume the pointer values are already valid.
 	 * (Could use ptr_to_compat() to set them.)
 	 */
-	if (!access_ok(VERIFY_READ, cmd, sizeof(*cmd)) ||
-	    !access_ok(VERIFY_WRITE, cmd32, sizeof(*cmd32)))
+	if (!access_ok(cmd, sizeof(*cmd)) ||
+	    !access_ok(cmd32, sizeof(*cmd32)))
 		return -EFAULT;
 
 	err = 0;
@@ -317,8 +317,8 @@ static int get_compat_insn(struct comedi_insn __user *insn,
 
 	/* Copy insn structure.  Ignore the unused members. */
 	err = 0;
-	if (!access_ok(VERIFY_READ, insn32, sizeof(*insn32)) ||
-	    !access_ok(VERIFY_WRITE, insn, sizeof(*insn)))
+	if (!access_ok(insn32, sizeof(*insn32)) ||
+	    !access_ok(insn, sizeof(*insn)))
 		return -EFAULT;
 
 	err |= __get_user(temp.uint, &insn32->insn);
@@ -350,7 +350,7 @@ static int compat_insnlist(struct file *file, unsigned long arg)
 	insnlist32 = compat_ptr(arg);
 
 	/* Get 32-bit insnlist structure.  */
-	if (!access_ok(VERIFY_READ, insnlist32, sizeof(*insnlist32)))
+	if (!access_ok(insnlist32, sizeof(*insnlist32)))
 		return -EFAULT;
 
 	err = 0;
@@ -365,7 +365,7 @@ static int compat_insnlist(struct file *file, unsigned long arg)
 					     insn[n_insns]));
 
 	/* Set native insnlist structure. */
-	if (!access_ok(VERIFY_WRITE, &s->insnlist, sizeof(s->insnlist)))
+	if (!access_ok(&s->insnlist, sizeof(s->insnlist)))
 		return -EFAULT;
 
 	err |= __put_user(n_insns, &s->insnlist.n_insns);
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 99460af61b77..4164414d4c64 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -573,7 +573,7 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 		return -EIO;
 
 	/* verify user access to buffer */
-	if (!access_ok(VERIFY_WRITE, buf, nr)) {
+	if (!access_ok(buf, nr)) {
 		printk(KERN_WARNING "%s(%d) n_hdlc_tty_read() can't verify user "
 		"buffer\n", __FILE__, __LINE__);
 		return -EFAULT;
diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
index 3de3c750b5f6..44f28a114c2b 100644
--- a/drivers/usb/core/devices.c
+++ b/drivers/usb/core/devices.c
@@ -598,7 +598,7 @@ static ssize_t usb_device_read(struct file *file, char __user *buf,
 		return -EINVAL;
 	if (nbytes <= 0)
 		return 0;
-	if (!access_ok(VERIFY_WRITE, buf, nbytes))
+	if (!access_ok(buf, nbytes))
 		return -EFAULT;
 
 	mutex_lock(&usb_bus_idr_lock);
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index a75bc0b8a50f..d65566341dd1 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1094,7 +1094,7 @@ static int proc_control(struct usb_dev_state *ps, void __user *arg)
 		ctrl.bRequestType, ctrl.bRequest, ctrl.wValue,
 		ctrl.wIndex, ctrl.wLength);
 	if (ctrl.bRequestType & 0x80) {
-		if (ctrl.wLength && !access_ok(VERIFY_WRITE, ctrl.data,
+		if (ctrl.wLength && !access_ok(ctrl.data,
 					       ctrl.wLength)) {
 			ret = -EINVAL;
 			goto done;
@@ -1183,7 +1183,7 @@ static int proc_bulk(struct usb_dev_state *ps, void __user *arg)
 	}
 	tmo = bulk.timeout;
 	if (bulk.ep & 0x80) {
-		if (len1 && !access_ok(VERIFY_WRITE, bulk.data, len1)) {
+		if (len1 && !access_ok(bulk.data, len1)) {
 			ret = -EINVAL;
 			goto done;
 		}
@@ -1584,8 +1584,7 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 	}
 
 	if (uurb->buffer_length > 0 &&
-			!access_ok(is_in ? VERIFY_WRITE : VERIFY_READ,
-				uurb->buffer, uurb->buffer_length)) {
+			!access_ok(uurb->buffer, uurb->buffer_length)) {
 		ret = -EFAULT;
 		goto error;
 	}
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index 54e859dcb25c..75b113a5b25c 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -252,7 +252,7 @@ static ssize_t f_hidg_read(struct file *file, char __user *buffer,
 	if (!count)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE, buffer, count))
+	if (!access_ok(buffer, count))
 		return -EFAULT;
 
 	spin_lock_irqsave(&hidg->read_spinlock, flags);
@@ -339,7 +339,7 @@ static ssize_t f_hidg_write(struct file *file, const char __user *buffer,
 	unsigned long flags;
 	ssize_t status = -ENOMEM;
 
-	if (!access_ok(VERIFY_READ, buffer, count))
+	if (!access_ok(buffer, count))
 		return -EFAULT;
 
 	spin_lock_irqsave(&hidg->write_spinlock, flags);
diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
index 11247322d587..660712e0bf98 100644
--- a/drivers/usb/gadget/udc/atmel_usba_udc.c
+++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
@@ -88,7 +88,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf,
 	size_t len, remaining, actual = 0;
 	char tmpbuf[38];
 
-	if (!access_ok(VERIFY_WRITE, buf, nbytes))
+	if (!access_ok(buf, nbytes))
 		return -EFAULT;
 
 	inode_lock(file_inode(file));
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 55e5aa662ad5..9f7942cbcbb2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -655,7 +655,7 @@ static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 	    a + (unsigned long)log_base > ULONG_MAX)
 		return false;
 
-	return access_ok(VERIFY_WRITE, log_base + a,
+	return access_ok(log_base + a,
 			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
 }
 
@@ -681,7 +681,7 @@ static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
 			return false;
 
 
-		if (!access_ok(VERIFY_WRITE, (void __user *)a,
+		if (!access_ok((void __user *)a,
 				    node->size))
 			return false;
 		else if (log_all && !log_access_ok(log_base,
@@ -973,10 +973,10 @@ static bool umem_access_ok(u64 uaddr, u64 size, int access)
 		return false;
 
 	if ((access & VHOST_ACCESS_RO) &&
-	    !access_ok(VERIFY_READ, (void __user *)a, size))
+	    !access_ok((void __user *)a, size))
 		return false;
 	if ((access & VHOST_ACCESS_WO) &&
-	    !access_ok(VERIFY_WRITE, (void __user *)a, size))
+	    !access_ok((void __user *)a, size))
 		return false;
 	return true;
 }
@@ -1185,10 +1185,10 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
-	return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
-	       access_ok(VERIFY_READ, avail,
+	return access_ok(desc, num * sizeof *desc) &&
+	       access_ok(avail,
 			 sizeof *avail + num * sizeof *avail->ring + s) &&
-	       access_ok(VERIFY_WRITE, used,
+	       access_ok(used,
 			sizeof *used + num * sizeof *used->ring + s);
 }
 
@@ -1814,7 +1814,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 		goto err;
 	vq->signalled_used_valid = false;
 	if (!vq->iotlb &&
-	    !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
+	    !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
 		r = -EFAULT;
 		goto err;
 	}
diff --git a/drivers/video/fbdev/amifb.c b/drivers/video/fbdev/amifb.c
index 0777aff211e5..758457026694 100644
--- a/drivers/video/fbdev/amifb.c
+++ b/drivers/video/fbdev/amifb.c
@@ -1855,7 +1855,7 @@ static int ami_get_var_cursorinfo(struct fb_var_cursorinfo *var,
 	var->yspot = par->crsr.spot_y;
 	if (size > var->height * var->width)
 		return -ENAMETOOLONG;
-	if (!access_ok(VERIFY_WRITE, data, size))
+	if (!access_ok(data, size))
 		return -EFAULT;
 	delta = 1 << par->crsr.fmode;
 	lspr = lofsprite + (delta << 1);
@@ -1935,7 +1935,7 @@ static int ami_set_var_cursorinfo(struct fb_var_cursorinfo *var,
 		return -EINVAL;
 	if (!var->height)
 		return -EINVAL;
-	if (!access_ok(VERIFY_READ, data, var->width * var->height))
+	if (!access_ok(data, var->width * var->height))
 		return -EFAULT;
 	delta = 1 << fmode;
 	lofsprite = shfsprite = (u_short *)spritememory;
diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c
index a3edb20ea4c3..53f93616c671 100644
--- a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c
+++ b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c
@@ -493,7 +493,7 @@ static int omapfb_memory_read(struct fb_info *fbi,
 	if (!display || !display->driver->memory_read)
 		return -ENOENT;
 
-	if (!access_ok(VERIFY_WRITE, mr->buffer, mr->buffer_size))
+	if (!access_ok(mr->buffer, mr->buffer_size))
 		return -EFAULT;
 
 	if (mr->w > 4096 || mr->h > 4096)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 7e6e682104dc..b24ddac1604b 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -459,14 +459,14 @@ static long privcmd_ioctl_mmap_batch(
 			return -EFAULT;
 		/* Returns per-frame error in m.arr. */
 		m.err = NULL;
-		if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr)))
+		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
 			return -EFAULT;
 		break;
 	case 2:
 		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
 			return -EFAULT;
 		/* Returns per-frame error code in m.err. */
-		if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err))))
+		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
 			return -EFAULT;
 		break;
 	default:
@@ -661,7 +661,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
 			goto out;
 		}
 
-		if (!access_ok(VERIFY_WRITE, kbufs[i].uptr,
+		if (!access_ok(kbufs[i].uptr,
 			       kbufs[i].size)) {
 			rc = -EFAULT;
 			goto out;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index c3deb2e35f20..ca9725f18e00 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -78,9 +78,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 1b15b43905f8..7ea2d6b1f170 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6646,7 +6646,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 		goto out;
 	}
 
-	if (!access_ok(VERIFY_READ, arg->clone_sources,
+	if (!access_ok(arg->clone_sources,
 			sizeof(*arg->clone_sources) *
 			arg->clone_sources_count)) {
 		ret = -EFAULT;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a5a1010886b..7ebae39fbcb3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2172,7 +2172,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
 		return -EFAULT;
 
 	/* Get the "struct file *" for the eventpoll file */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c8366cb8eccd..0295a095b920 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -805,7 +805,7 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
 		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
-	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
+	if (!access_ok(d1, sizeof(struct __fat_dirent[2])))
 		return -EFAULT;
 	/*
 	 * Yes, we don't need this put_user() absolutely. However old
@@ -845,7 +845,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
-	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
+	if (!access_ok(d1, sizeof(struct compat_dirent[2])))
 		return -EFAULT;
 	/*
 	 * Yes, we don't need this put_user() absolutely. However old
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d64f622cac8b..fef3a6bf7c78 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -203,7 +203,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	fieinfo.fi_extents_start = ufiemap->fm_extents;
 
 	if (fiemap.fm_extent_count != 0 &&
-	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+	    !access_ok(fieinfo.fi_extents_start,
 		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
 		return -EFAULT;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index a7f91265ea67..97b7c7098c3d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2651,7 +2651,7 @@ static long exact_copy_from_user(void *to, const void __user * from,
 	const char __user *f = from;
 	char c;
 
-	if (!access_ok(VERIFY_READ, from, n))
+	if (!access_ok(from, n))
 		return n;
 
 	current->kernel_uaccess_faults_ok++;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b8fa1487cd85..8decbe95dcec 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -254,7 +254,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	if (!count)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/* don't read past the lvb */
@@ -302,7 +302,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	if (!count)
 		return 0;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/* don't write past the lvb */
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 24db02de1787..97fcef74e5af 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -33,7 +33,7 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
 	record.size = count;
 
 	/* check outside lock, page in any data. write_user also checks */
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	mutex_lock(&pmsg_lock);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index c11711c2cc83..f375c0735351 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -357,7 +357,7 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
 	int rem, ret = 0, c = count;
 	size_t start;
 
-	if (unlikely(!access_ok(VERIFY_READ, s, count)))
+	if (unlikely(!access_ok(s, count)))
 		return -EFAULT;
 	if (unlikely(c > prz->buffer_size)) {
 		s += c - prz->buffer_size;
diff --git a/fs/read_write.c b/fs/read_write.c
index 58f30537c47a..ff3c5e6f87cf 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -442,7 +442,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
-	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
+	if (unlikely(!access_ok(buf, count)))
 		return -EFAULT;
 
 	ret = rw_verify_area(READ, file, pos, count);
@@ -538,7 +538,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
-	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+	if (unlikely(!access_ok(buf, count)))
 		return -EFAULT;
 
 	ret = rw_verify_area(WRITE, file, pos, count);
@@ -718,9 +718,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 	return ret;
 }
 
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
 /**
  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
  *     into the kernel and check that it is valid.
@@ -810,7 +807,7 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			goto out;
 		}
 		if (type >= 0
-		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+		    && unlikely(!access_ok(buf, len))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -856,7 +853,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	*ret_pointer = iov;
 
 	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 		goto out;
 
 	/*
@@ -881,7 +878,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 			goto out;
 		if (type >= 0 &&
-		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+		    !access_ok(compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
diff --git a/fs/readdir.c b/fs/readdir.c
index d97f548e6323..2f6a4534e0df 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -105,7 +105,7 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
 	}
 	buf->result++;
 	dirent = buf->dirent;
-	if (!access_ok(VERIFY_WRITE, dirent,
+	if (!access_ok(dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
@@ -221,7 +221,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
@@ -304,7 +304,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
@@ -365,7 +365,7 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
 	}
 	buf->result++;
 	dirent = buf->dirent;
-	if (!access_ok(VERIFY_WRITE, dirent,
+	if (!access_ok(dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
@@ -475,7 +475,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	};
 	int error;
 
-	if (!access_ok(VERIFY_WRITE, dirent, count))
+	if (!access_ok(dirent, count))
 		return -EFAULT;
 
 	f = fdget_pos(fd);
diff --git a/fs/select.c b/fs/select.c
index 4c8652390c94..d0f35dbc0e8f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -381,9 +381,6 @@ typedef struct {
 #define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
 
 /*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
  * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
  */
 static inline
@@ -782,7 +779,7 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 	sigset_t __user *up = NULL;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
 		    || __get_user(up, (sigset_t __user * __user *)sig)
 		    || __get_user(sigsetsize,
 				(size_t __user *)(sig+sizeof(void *))))
@@ -802,7 +799,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *,
 	sigset_t __user *up = NULL;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		if (!access_ok(sig, sizeof(void *)+sizeof(size_t))
 		    || __get_user(up, (sigset_t __user * __user *)sig)
 		    || __get_user(sigsetsize,
 				(size_t __user *)(sig+sizeof(void *))))
@@ -1368,7 +1365,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
 	compat_uptr_t up = 0;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig,
+		if (!access_ok(sig,
 				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
 				__get_user(up, (compat_uptr_t __user *)sig) ||
 				__get_user(sigsetsize,
@@ -1390,7 +1387,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
 	compat_uptr_t up = 0;
 
 	if (sig) {
-		if (!access_ok(VERIFY_READ, sig,
+		if (!access_ok(sig,
 				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
 		    	__get_user(up, (compat_uptr_t __user *)sig) ||
 		    	__get_user(sigsetsize,
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index 6b2e63df2739..d82c78a79da5 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -35,7 +35,7 @@ static inline void set_fs(mm_segment_t fs)
 #define segment_eq(a, b) ((a).seg == (b).seg)
 #endif
 
-#define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size))
+#define access_ok(addr, size) __access_ok((unsigned long)(addr),(size))
 
 /*
  * The architecture should really override this if possible, at least
@@ -78,7 +78,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 ({								\
 	void __user *__p = (ptr);				\
 	might_fault();						\
-	access_ok(VERIFY_WRITE, __p, sizeof(*ptr)) ?		\
+	access_ok(__p, sizeof(*ptr)) ?		\
 		__put_user((x), ((__typeof__(*(ptr)) __user *)__p)) :	\
 		-EFAULT;					\
 })
@@ -140,7 +140,7 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 ({								\
 	const void __user *__p = (ptr);				\
 	might_fault();						\
-	access_ok(VERIFY_READ, __p, sizeof(*ptr)) ?		\
+	access_ok(__p, sizeof(*ptr)) ?		\
 		__get_user((x), (__typeof__(*(ptr)) __user *)__p) :\
 		((x) = (__typeof__(*(ptr)))0,-EFAULT);		\
 })
@@ -175,7 +175,7 @@ __strncpy_from_user(char *dst, const char __user *src, long count)
 static inline long
 strncpy_from_user(char *dst, const char __user *src, long count)
 {
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return -EFAULT;
 	return __strncpy_from_user(dst, src, count);
 }
@@ -196,7 +196,7 @@ strncpy_from_user(char *dst, const char __user *src, long count)
  */
 static inline long strnlen_user(const char __user *src, long n)
 {
-	if (!access_ok(VERIFY_READ, src, 1))
+	if (!access_ok(src, 1))
 		return 0;
 	return __strnlen_user(src, n);
 }
@@ -217,7 +217,7 @@ static inline __must_check unsigned long
 clear_user(void __user *to, unsigned long n)
 {
 	might_fault();
-	if (!access_ok(VERIFY_WRITE, to, n))
+	if (!access_ok(to, n))
 		return n;
 
 	return __clear_user(to, n);
diff --git a/include/linux/regset.h b/include/linux/regset.h
index 494cedaafdf2..a85c1707285c 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -376,7 +376,7 @@ static inline int copy_regset_to_user(struct task_struct *target,
 	if (!regset->get)
 		return -EOPNOTSUPP;
 
-	if (!access_ok(VERIFY_WRITE, data, size))
+	if (!access_ok(data, size))
 		return -EFAULT;
 
 	return regset->get(target, regset, offset, size, NULL, data);
@@ -402,7 +402,7 @@ static inline int copy_regset_from_user(struct task_struct *target,
 	if (!regset->set)
 		return -EOPNOTSUPP;
 
-	if (!access_ok(VERIFY_READ, data, size))
+	if (!access_ok(data, size))
 		return -EFAULT;
 
 	return regset->set(target, regset, offset, size, NULL, data);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index efe79c1cdd47..bf2523867a02 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -6,9 +6,6 @@
 #include <linux/thread_info.h>
 #include <linux/kasan-checks.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 #define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS)
 
 #include <asm/uaccess.h>
@@ -111,7 +108,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long res = n;
 	might_fault();
-	if (likely(access_ok(VERIFY_READ, from, n))) {
+	if (likely(access_ok(from, n))) {
 		kasan_check_write(to, n);
 		res = raw_copy_from_user(to, from, n);
 	}
@@ -129,7 +126,7 @@ static inline unsigned long
 _copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
-	if (access_ok(VERIFY_WRITE, to, n)) {
+	if (access_ok(to, n)) {
 		kasan_check_read(from, n);
 		n = raw_copy_to_user(to, from, n);
 	}
@@ -160,7 +157,7 @@ static __always_inline unsigned long __must_check
 copy_in_user(void __user *to, const void __user *from, unsigned long n)
 {
 	might_fault();
-	if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n))
+	if (access_ok(to, n) && access_ok(from, n))
 		n = raw_copy_in_user(to, from, n);
 	return n;
 }
diff --git a/include/net/checksum.h b/include/net/checksum.h
index aef2b2bb6603..0f319e13be2c 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -30,7 +30,7 @@ static inline
 __wsum csum_and_copy_from_user (const void __user *src, void *dst,
 				      int len, __wsum sum, int *err_ptr)
 {
-	if (access_ok(VERIFY_READ, src, len))
+	if (access_ok(src, len))
 		return csum_partial_copy_from_user(src, dst, len, sum, err_ptr);
 
 	if (len)
@@ -46,7 +46,7 @@ static __inline__ __wsum csum_and_copy_to_user
 {
 	sum = csum_partial(src, len, sum);
 
-	if (access_ok(VERIFY_WRITE, dst, len)) {
+	if (access_ok(dst, len)) {
 		if (copy_to_user(dst, src, len) == 0)
 			return sum;
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0607db304def..b155cd17c1bd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -79,7 +79,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
 	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
 		return -E2BIG;
 
-	if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+	if (unlikely(!access_ok(uaddr, actual_size)))
 		return -EFAULT;
 
 	if (actual_size <= expected_size)
diff --git a/kernel/compat.c b/kernel/compat.c
index 089d00d0da9c..705d4ae6c018 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -95,28 +95,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
 
 static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
 {
-	return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+	return (!access_ok(ctv, sizeof(*ctv)) ||
 			__get_user(tv->tv_sec, &ctv->tv_sec) ||
 			__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
 }
 
 static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv)
 {
-	return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+	return (!access_ok(ctv, sizeof(*ctv)) ||
 			__put_user(tv->tv_sec, &ctv->tv_sec) ||
 			__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
 }
 
 static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts)
 {
-	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+	return (!access_ok(cts, sizeof(*cts)) ||
 			__get_user(ts->tv_sec, &cts->tv_sec) ||
 			__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
 static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts)
 {
-	return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+	return (!access_ok(cts, sizeof(*cts)) ||
 			__put_user(ts->tv_sec, &cts->tv_sec) ||
 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
@@ -335,7 +335,7 @@ int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event)
 {
 	memset(event, 0, sizeof(*event));
-	return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+	return (!access_ok(u_event, sizeof(*u_event)) ||
 		__get_user(event->sigev_value.sival_int,
 			&u_event->sigev_value.sival_int) ||
 		__get_user(event->sigev_signo, &u_event->sigev_signo) ||
@@ -354,7 +354,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
 	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
-	if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+	if (!access_ok(umask, bitmap_size / 8))
 		return -EFAULT;
 
 	user_access_begin();
@@ -384,7 +384,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
 	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
-	if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+	if (!access_ok(umask, bitmap_size / 8))
 		return -EFAULT;
 
 	user_access_begin();
@@ -438,7 +438,7 @@ void __user *compat_alloc_user_space(unsigned long len)
 
 	ptr = arch_compat_alloc_user_space(len);
 
-	if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+	if (unlikely(!access_ok(ptr, len)))
 		return NULL;
 
 	return ptr;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67ecac337374..3cd13a30f732 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10135,7 +10135,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	u32 size;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+	if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
 		return -EFAULT;
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..8a01b671dc1f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1604,7 +1604,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
-	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+	if (!access_ok(infop, sizeof(*infop)))
 		return -EFAULT;
 
 	user_access_begin();
@@ -1732,7 +1732,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
-	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+	if (!access_ok(infop, sizeof(*infop)))
 		return -EFAULT;
 
 	user_access_begin();
diff --git a/kernel/futex.c b/kernel/futex.c
index 054105854e0e..be3bff2315ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -481,13 +481,18 @@ static void drop_futex_key_refs(union futex_key *key)
 	}
 }
 
+enum futex_access {
+	FUTEX_READ,
+	FUTEX_WRITE
+};
+
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
  * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
  * @key:	address where result is stored.
- * @rw:		mapping needs to be read/write (values: VERIFY_READ,
- *              VERIFY_WRITE)
+ * @rw:		mapping needs to be read/write (values: FUTEX_READ,
+ *              FUTEX_WRITE)
  *
  * Return: a negative error code or 0
  *
@@ -500,7 +505,7 @@ static void drop_futex_key_refs(union futex_key *key)
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -516,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 		return -EINVAL;
 	address -= key->both.offset;
 
-	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+	if (unlikely(!access_ok(uaddr, sizeof(u32))))
 		return -EFAULT;
 
 	if (unlikely(should_fail_futex(fshared)))
@@ -546,7 +551,7 @@ again:
 	 * If write access is not required (eg. FUTEX_WAIT), try
 	 * and get read-only access.
 	 */
-	if (err == -EFAULT && rw == VERIFY_READ) {
+	if (err == -EFAULT && rw == FUTEX_READ) {
 		err = get_user_pages_fast(address, 1, 0, &page);
 		ro = 1;
 	}
@@ -1583,7 +1588,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1642,7 +1647,7 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
 		oparg = 1 << oparg;
 	}
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+	if (!access_ok(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
@@ -1682,10 +1687,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -1961,11 +1966,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	}
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -2634,7 +2639,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -2793,7 +2798,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	}
 
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -2972,7 +2977,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;
 
@@ -3199,7 +3204,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);
 
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1306fe0c1dc6..d3d170374ceb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1466,7 +1466,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
 			return -EINVAL;
 		if (!len)
 			return 0;
-		if (!access_ok(VERIFY_WRITE, buf, len))
+		if (!access_ok(buf, len))
 			return -EFAULT;
 		error = wait_event_interruptible(log_wait,
 						 syslog_seq != log_next_seq);
@@ -1484,7 +1484,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
 			return -EINVAL;
 		if (!len)
 			return 0;
-		if (!access_ok(VERIFY_WRITE, buf, len))
+		if (!access_ok(buf, len))
 			return -EFAULT;
 		error = syslog_print_all(buf, len, clear);
 		break;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c2cee9db5204..771e93f9c43f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1073,7 +1073,7 @@ int ptrace_request(struct task_struct *child, long request,
 		struct iovec kiov;
 		struct iovec __user *uiov = datavp;
 
-		if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+		if (!access_ok(uiov, sizeof(*uiov)))
 			return -EFAULT;
 
 		if (__get_user(kiov.iov_base, &uiov->iov_base) ||
@@ -1229,7 +1229,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 		compat_uptr_t ptr;
 		compat_size_t len;
 
-		if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+		if (!access_ok(uiov, sizeof(*uiov)))
 			return -EFAULT;
 
 		if (__get_user(ptr, &uiov->iov_base) ||
diff --git a/kernel/rseq.c b/kernel/rseq.c
index c6242d8594dc..25e9a7b60eba 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -267,7 +267,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 
 	if (unlikely(t->flags & PF_EXITING))
 		return;
-	if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))))
+	if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
 		goto error;
 	ret = rseq_ip_fixup(regs);
 	if (unlikely(ret < 0))
@@ -295,7 +295,7 @@ void rseq_syscall(struct pt_regs *regs)
 
 	if (!t->rseq)
 		return;
-	if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) ||
+	if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
 	    rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
 		force_sig(SIGSEGV, t);
 }
@@ -351,7 +351,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
 	if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
 	    rseq_len != sizeof(*rseq))
 		return -EINVAL;
-	if (!access_ok(VERIFY_WRITE, rseq, rseq_len))
+	if (!access_ok(rseq, rseq_len))
 		return -EFAULT;
 	current->rseq = rseq;
 	current->rseq_len = rseq_len;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f66920173370..1f3e19fd6dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4450,7 +4450,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
 	u32 size;
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+	if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
 		return -EFAULT;
 
 	/* Zero the full structure, so that a short copy will be nice: */
@@ -4650,7 +4650,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 {
 	int ret;
 
-	if (!access_ok(VERIFY_WRITE, uattr, usize))
+	if (!access_ok(uattr, usize))
 		return -EFAULT;
 
 	/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 53e07d97ffe0..e1d7ad8e6ab1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3997,7 +3997,7 @@ SYSCALL_DEFINE3(sigaction, int, sig,
 
 	if (act) {
 		old_sigset_t mask;
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+		if (!access_ok(act, sizeof(*act)) ||
 		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
 		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
 		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
@@ -4012,7 +4012,7 @@ SYSCALL_DEFINE3(sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		if (!access_ok(oact, sizeof(*oact)) ||
 		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
 		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
 		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
@@ -4034,7 +4034,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
 	compat_uptr_t handler, restorer;
 
 	if (act) {
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+		if (!access_ok(act, sizeof(*act)) ||
 		    __get_user(handler, &act->sa_handler) ||
 		    __get_user(restorer, &act->sa_restorer) ||
 		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
@@ -4052,7 +4052,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		if (!access_ok(oact, sizeof(*oact)) ||
 		    __put_user(ptr_to_compat(old_ka.sa.sa_handler),
 			       &oact->sa_handler) ||
 		    __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
diff --git a/kernel/sys.c b/kernel/sys.c
index 64b5a230f38d..a48cbf1414b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2627,7 +2627,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 		s.freehigh >>= bitcount;
 	}
 
-	if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+	if (!access_ok(info, sizeof(struct compat_sysinfo)) ||
 	    __put_user(s.uptime, &info->uptime) ||
 	    __put_user(s.loads[0], &info->loads[0]) ||
 	    __put_user(s.loads[1], &info->loads[1]) ||
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9ddb6fddb4e0..8b068adb9da1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -170,7 +170,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 		return -EPERM;
 	if (unlikely(uaccess_kernel()))
 		return -EPERM;
-	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+	if (!access_ok(unsafe_ptr, size))
 		return -EPERM;
 
 	return probe_kernel_write(unsafe_ptr, src, size);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index eead55aa7170..98872e9025da 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -443,7 +443,7 @@ int bitmap_parse_user(const char __user *ubuf,
 			unsigned int ulen, unsigned long *maskp,
 			int nmaskbits)
 {
-	if (!access_ok(VERIFY_READ, ubuf, ulen))
+	if (!access_ok(ubuf, ulen))
 		return -EFAULT;
 	return __bitmap_parse((const char __force *)ubuf,
 				ulen, 1, maskp, nmaskbits);
@@ -641,7 +641,7 @@ int bitmap_parselist_user(const char __user *ubuf,
 			unsigned int ulen, unsigned long *maskp,
 			int nmaskbits)
 {
-	if (!access_ok(VERIFY_READ, ubuf, ulen))
+	if (!access_ok(ubuf, ulen))
 		return -EFAULT;
 	return __bitmap_parselist((const char __force *)ubuf,
 					ulen, 1, maskp, nmaskbits);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 1928009f506e..c93870987b58 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -136,7 +136,7 @@
 
 static int copyout(void __user *to, const void *from, size_t n)
 {
-	if (access_ok(VERIFY_WRITE, to, n)) {
+	if (access_ok(to, n)) {
 		kasan_check_read(from, n);
 		n = raw_copy_to_user(to, from, n);
 	}
@@ -145,7 +145,7 @@ static int copyout(void __user *to, const void *from, size_t n)
 
 static int copyin(void *to, const void __user *from, size_t n)
 {
-	if (access_ok(VERIFY_READ, from, n)) {
+	if (access_ok(from, n)) {
 		kasan_check_write(to, n);
 		n = raw_copy_from_user(to, from, n);
 	}
@@ -614,7 +614,7 @@ EXPORT_SYMBOL(_copy_to_iter);
 #ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
 static int copyout_mcsafe(void __user *to, const void *from, size_t n)
 {
-	if (access_ok(VERIFY_WRITE, to, n)) {
+	if (access_ok(to, n)) {
 		kasan_check_read(from, n);
 		n = copy_to_user_mcsafe((__force void *) to, from, n);
 	}
@@ -1663,7 +1663,7 @@ int import_single_range(int rw, void __user *buf, size_t len,
 {
 	if (len > MAX_RW_COUNT)
 		len = MAX_RW_COUNT;
-	if (unlikely(!access_ok(!rw, buf, len)))
+	if (unlikely(!access_ok(buf, len)))
 		return -EFAULT;
 
 	iov->iov_base = buf;
diff --git a/lib/usercopy.c b/lib/usercopy.c
index 3744b2a8e591..c2bfbcaeb3dc 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -8,7 +8,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n
 {
 	unsigned long res = n;
 	might_fault();
-	if (likely(access_ok(VERIFY_READ, from, n))) {
+	if (likely(access_ok(from, n))) {
 		kasan_check_write(to, n);
 		res = raw_copy_from_user(to, from, n);
 	}
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(_copy_from_user);
 unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
-	if (likely(access_ok(VERIFY_WRITE, to, n))) {
+	if (likely(access_ok(to, n))) {
 		kasan_check_read(from, n);
 		n = raw_copy_to_user(to, from, n);
 	}
diff --git a/mm/gup.c b/mm/gup.c
index 8cb68a50dbdf..6f591ccb8eca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1813,8 +1813,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
 
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					(void __user *)start, len)))
+	if (unlikely(!access_ok((void __user *)start, len)))
 		return 0;
 
 	/*
@@ -1868,8 +1867,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	if (nr_pages <= 0)
 		return 0;
 
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-					(void __user *)start, len)))
+	if (unlikely(!access_ok((void __user *)start, len)))
 		return -EFAULT;
 
 	if (gup_fast_permitted(start, nr_pages, write)) {
diff --git a/mm/mincore.c b/mm/mincore.c
index 4985965aa20a..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -233,14 +233,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 		return -EINVAL;
 
 	/* ..and we need to be passed a valid user-space range */
-	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+	if (!access_ok((void __user *) start, len))
 		return -ENOMEM;
 
 	/* This also avoids any overflows on PAGE_ALIGN */
 	pages = len >> PAGE_SHIFT;
 	pages += (offset_in_page(len)) != 0;
 
-	if (!access_ok(VERIFY_WRITE, vec, pages))
+	if (!access_ok(vec, pages))
 		return -EFAULT;
 
 	tmp = (void *) __get_free_page(GFP_USER);
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index d70f363c52ae..6d5859714f52 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -147,7 +147,7 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf,
 	if (!buf || count < sizeof(struct batadv_icmp_packet))
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	error = wait_event_interruptible(socket_client->queue_wait,
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 02e55b78132f..75f602e1ce94 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -136,7 +136,7 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf,
 	if (count == 0)
 		return 0;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	error = wait_event_interruptible(debug_log->queue_wait,
diff --git a/net/compat.c b/net/compat.c
index c3a2f868e8af..959d1c51826d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -358,7 +358,7 @@ static int do_set_sock_timeout(struct socket *sock, int level,
 
 	if (optlen < sizeof(*up))
 		return -EINVAL;
-	if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
+	if (!access_ok(up, sizeof(*up)) ||
 	    __get_user(ktime.tv_sec, &up->tv_sec) ||
 	    __get_user(ktime.tv_usec, &up->tv_usec))
 		return -EFAULT;
@@ -438,7 +438,7 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname,
 
 	if (!err) {
 		if (put_user(sizeof(*up), optlen) ||
-		    !access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
+		    !access_ok(up, sizeof(*up)) ||
 		    __put_user(ktime.tv_sec, &up->tv_sec) ||
 		    __put_user(ktime.tv_usec, &up->tv_usec))
 			err = -EFAULT;
@@ -590,8 +590,8 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 			compat_alloc_user_space(sizeof(struct group_req));
 		u32 interface;
 
-		if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) ||
-		    !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) ||
+		if (!access_ok(gr32, sizeof(*gr32)) ||
+		    !access_ok(kgr, sizeof(struct group_req)) ||
 		    __get_user(interface, &gr32->gr_interface) ||
 		    __put_user(interface, &kgr->gr_interface) ||
 		    copy_in_user(&kgr->gr_group, &gr32->gr_group,
@@ -611,8 +611,8 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 			sizeof(struct group_source_req));
 		u32 interface;
 
-		if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) ||
-		    !access_ok(VERIFY_WRITE, kgsr,
+		if (!access_ok(gsr32, sizeof(*gsr32)) ||
+		    !access_ok(kgsr,
 			sizeof(struct group_source_req)) ||
 		    __get_user(interface, &gsr32->gsr_interface) ||
 		    __put_user(interface, &kgsr->gsr_interface) ||
@@ -631,7 +631,7 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 		struct group_filter __user *kgf;
 		u32 interface, fmode, numsrc;
 
-		if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+		if (!access_ok(gf32, __COMPAT_GF0_SIZE) ||
 		    __get_user(interface, &gf32->gf_interface) ||
 		    __get_user(fmode, &gf32->gf_fmode) ||
 		    __get_user(numsrc, &gf32->gf_numsrc))
@@ -641,7 +641,7 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 		if (koptlen < GROUP_FILTER_SIZE(numsrc))
 			return -EINVAL;
 		kgf = compat_alloc_user_space(koptlen);
-		if (!access_ok(VERIFY_WRITE, kgf, koptlen) ||
+		if (!access_ok(kgf, koptlen) ||
 		    __put_user(interface, &kgf->gf_interface) ||
 		    __put_user(fmode, &kgf->gf_fmode) ||
 		    __put_user(numsrc, &kgf->gf_numsrc) ||
@@ -675,7 +675,7 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 		return getsockopt(sock, level, optname, optval, optlen);
 
 	koptlen = compat_alloc_user_space(sizeof(*koptlen));
-	if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) ||
+	if (!access_ok(optlen, sizeof(*optlen)) ||
 	    __get_user(ulen, optlen))
 		return -EFAULT;
 
@@ -685,14 +685,14 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 	if (klen < GROUP_FILTER_SIZE(0))
 		return -EINVAL;
 
-	if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) ||
+	if (!access_ok(koptlen, sizeof(*koptlen)) ||
 	    __put_user(klen, koptlen))
 		return -EFAULT;
 
 	/* have to allow space for previous compat_alloc_user_space, too */
 	kgf = compat_alloc_user_space(klen+sizeof(*optlen));
 
-	if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+	if (!access_ok(gf32, __COMPAT_GF0_SIZE) ||
 	    __get_user(interface, &gf32->gf_interface) ||
 	    __get_user(fmode, &gf32->gf_fmode) ||
 	    __get_user(numsrc, &gf32->gf_numsrc) ||
@@ -706,18 +706,18 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 	if (err)
 		return err;
 
-	if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) ||
+	if (!access_ok(koptlen, sizeof(*koptlen)) ||
 	    __get_user(klen, koptlen))
 		return -EFAULT;
 
 	ulen = klen - (sizeof(*kgf)-sizeof(*gf32));
 
-	if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) ||
+	if (!access_ok(optlen, sizeof(*optlen)) ||
 	    __put_user(ulen, optlen))
 		return -EFAULT;
 
-	if (!access_ok(VERIFY_READ, kgf, klen) ||
-	    !access_ok(VERIFY_WRITE, gf32, ulen) ||
+	if (!access_ok(kgf, klen) ||
+	    !access_ok(gf32, ulen) ||
 	    __get_user(interface, &kgf->gf_interface) ||
 	    __get_user(fmode, &kgf->gf_fmode) ||
 	    __get_user(numsrc, &kgf->gf_numsrc) ||
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 8c3936403fea..0bea8ff8b0d3 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -89,7 +89,7 @@ proc_dodebug(struct ctl_table *table, int write,
 	left = *lenp;
 
 	if (write) {
-		if (!access_ok(VERIFY_READ, buffer, left))
+		if (!access_ok(buffer, left))
 			return -EFAULT;
 		p = buffer;
 		while (left && __get_user(c, p) >= 0 && isspace(c))
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 9b38f94b5dd0..c598aa00d5e3 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -2591,7 +2591,7 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
 	int idx;
 	if (!head->write)
 		return -ENOSYS;
-	if (!access_ok(VERIFY_READ, buffer, buffer_len))
+	if (!access_ok(buffer, buffer_len))
 		return -EFAULT;
 	if (mutex_lock_interruptible(&head->io_sem))
 		return -EINTR;
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 92e6524a3a9d..7d4640d1fe9f 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -393,7 +393,7 @@ static ssize_t snd_seq_read(struct file *file, char __user *buf, size_t count,
 	if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT))
 		return -ENXIO;
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 
 	/* check client structures are in place */
diff --git a/sound/isa/sb/emu8000_patch.c b/sound/isa/sb/emu8000_patch.c
index d45a6b9d6437..3d44c358c4b3 100644
--- a/sound/isa/sb/emu8000_patch.c
+++ b/sound/isa/sb/emu8000_patch.c
@@ -183,10 +183,10 @@ snd_emu8000_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 	}
 
 	if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS) {
-		if (!access_ok(VERIFY_READ, data, sp->v.size))
+		if (!access_ok(data, sp->v.size))
 			return -EFAULT;
 	} else {
-		if (!access_ok(VERIFY_READ, data, sp->v.size * 2))
+		if (!access_ok(data, sp->v.size * 2))
 			return -EFAULT;
 	}
 
diff --git a/tools/perf/util/include/asm/uaccess.h b/tools/perf/util/include/asm/uaccess.h
index 6a6f4b990547..548100315710 100644
--- a/tools/perf/util/include/asm/uaccess.h
+++ b/tools/perf/util/include/asm/uaccess.h
@@ -10,6 +10,6 @@
 
 #define get_user	__get_user
 
-#define access_ok(type, addr, size)	1
+#define access_ok(addr, size)	1
 
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 666d0155662d..1f888a103f78 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -939,8 +939,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* We can read the guest memory with __xxx_user() later on. */
 	if ((id < KVM_USER_MEM_SLOTS) &&
 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
-	     !access_ok(VERIFY_WRITE,
-			(void __user *)(unsigned long)mem->userspace_addr,
+	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size)))
 		goto out;
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
-- 
cgit 


From 2e05ea5cdc1ac55d9ef678ed5ea6c38acf7fd2a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Dec 2018 08:50:35 +0100
Subject: dma-mapping: implement dma_map_single_attrs using dma_map_page_attrs

And also switch the way we implement the unmap side around to stay
consistent.  This ensures dma-debug works again because it records which
function we used for mapping to ensure it is also used for unmapping,
and also reduces further code duplication.  Last but not least this
also officially allows calling dma_sync_single_* for mappings created
using dma_map_page, which is perfectly fine given that the sync calls
only take a dma_addr_t, but not a virtual address or struct page.

Fixes: 7f0fee242e ("dma-mapping: merge dma_unmap_page_attrs and dma_unmap_single_attrs")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: LABBE Corentin <clabbe.montjoie@gmail.com>
---
 include/linux/dma-debug.h   | 11 +++-----
 include/linux/dma-mapping.h | 66 +++++++++++++++++----------------------------
 kernel/dma/debug.c          | 17 +++---------
 3 files changed, 32 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 2ad5c363d7d5..cb422cbe587d 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -35,13 +35,12 @@ extern void debug_dma_map_single(struct device *dev, const void *addr,
 
 extern void debug_dma_map_page(struct device *dev, struct page *page,
 			       size_t offset, size_t size,
-			       int direction, dma_addr_t dma_addr,
-			       bool map_single);
+			       int direction, dma_addr_t dma_addr);
 
 extern void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 
 extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
-				 size_t size, int direction, bool map_single);
+				 size_t size, int direction);
 
 extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 			     int nents, int mapped_ents, int direction);
@@ -95,8 +94,7 @@ static inline void debug_dma_map_single(struct device *dev, const void *addr,
 
 static inline void debug_dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
-				      int direction, dma_addr_t dma_addr,
-				      bool map_single)
+				      int direction, dma_addr_t dma_addr)
 {
 }
 
@@ -106,8 +104,7 @@ static inline void debug_dma_mapping_error(struct device *dev,
 }
 
 static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
-					size_t size, int direction,
-					bool map_single)
+					size_t size, int direction)
 {
 }
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index ba521d5506c9..0452a8be2789 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -284,32 +284,25 @@ static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
 }
 #endif
 
-static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
-					      size_t size,
-					      enum dma_data_direction dir,
-					      unsigned long attrs)
+static inline dma_addr_t dma_map_page_attrs(struct device *dev,
+		struct page *page, size_t offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
-	debug_dma_map_single(dev, ptr, size);
 	if (dma_is_direct(ops))
-		addr = dma_direct_map_page(dev, virt_to_page(ptr),
-				offset_in_page(ptr), size, dir, attrs);
+		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
 	else
-		addr = ops->map_page(dev, virt_to_page(ptr),
-				offset_in_page(ptr), size, dir, attrs);
-	debug_dma_map_page(dev, virt_to_page(ptr),
-			   offset_in_page(ptr), size,
-			   dir, addr, true);
+		addr = ops->map_page(dev, page, offset, size, dir, attrs);
+	debug_dma_map_page(dev, page, offset, size, dir, addr);
+
 	return addr;
 }
 
-static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
-					  size_t size,
-					  enum dma_data_direction dir,
-					  unsigned long attrs)
+static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
@@ -318,13 +311,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 		dma_direct_unmap_page(dev, addr, size, dir, attrs);
 	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
-	debug_dma_unmap_page(dev, addr, size, dir, true);
-}
-
-static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-	return dma_unmap_single_attrs(dev, addr, size, dir, attrs);
+	debug_dma_unmap_page(dev, addr, size, dir);
 }
 
 /*
@@ -363,25 +350,6 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 		ops->unmap_sg(dev, sg, nents, dir, attrs);
 }
 
-static inline dma_addr_t dma_map_page_attrs(struct device *dev,
-					    struct page *page,
-					    size_t offset, size_t size,
-					    enum dma_data_direction dir,
-					    unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	dma_addr_t addr;
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
-	else
-		addr = ops->map_page(dev, page, offset, size, dir, attrs);
-	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
-
-	return addr;
-}
-
 static inline dma_addr_t dma_map_resource(struct device *dev,
 					  phys_addr_t phys_addr,
 					  size_t size,
@@ -488,6 +456,20 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 
 }
 
+static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	debug_dma_map_single(dev, ptr, size);
+	return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr),
+			size, dir, attrs);
+}
+
+static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
+}
+
 #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0)
 #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
 #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 164706da2a73..1e0157113d15 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -49,7 +49,6 @@
 
 enum {
 	dma_debug_single,
-	dma_debug_page,
 	dma_debug_sg,
 	dma_debug_coherent,
 	dma_debug_resource,
@@ -1300,8 +1299,7 @@ void debug_dma_map_single(struct device *dev, const void *addr,
 EXPORT_SYMBOL(debug_dma_map_single);
 
 void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
-			size_t size, int direction, dma_addr_t dma_addr,
-			bool map_single)
+			size_t size, int direction, dma_addr_t dma_addr)
 {
 	struct dma_debug_entry *entry;
 
@@ -1316,7 +1314,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
 		return;
 
 	entry->dev       = dev;
-	entry->type      = dma_debug_page;
+	entry->type      = dma_debug_single;
 	entry->pfn	 = page_to_pfn(page);
 	entry->offset	 = offset,
 	entry->dev_addr  = dma_addr;
@@ -1324,9 +1322,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
 	entry->direction = direction;
 	entry->map_err_type = MAP_ERR_NOT_CHECKED;
 
-	if (map_single)
-		entry->type = dma_debug_single;
-
 	check_for_stack(dev, page, offset);
 
 	if (!PageHighMem(page)) {
@@ -1378,10 +1373,10 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 EXPORT_SYMBOL(debug_dma_mapping_error);
 
 void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
-			  size_t size, int direction, bool map_single)
+			  size_t size, int direction)
 {
 	struct dma_debug_entry ref = {
-		.type           = dma_debug_page,
+		.type           = dma_debug_single,
 		.dev            = dev,
 		.dev_addr       = addr,
 		.size           = size,
@@ -1390,10 +1385,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
 
 	if (unlikely(dma_debug_disabled()))
 		return;
-
-	if (map_single)
-		ref.type = dma_debug_single;
-
 	check_unmap(&ref);
 }
 EXPORT_SYMBOL(debug_dma_unmap_page);
-- 
cgit 


From d7076f07840851bbe57cb21ba052d6a4a9b1efa9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Dec 2018 17:44:19 +0100
Subject: dma-mapping: implement dmam_alloc_coherent using dmam_alloc_attrs

dmam_alloc_coherent is just the default no-flags case of
dmam_alloc_attrs, so take advantage of this similar to the non-managed
version.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 20 +++++++++++++-------
 kernel/dma/mapping.c        | 39 ---------------------------------------
 2 files changed, 13 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0452a8be2789..fa2ebe8ad4d0 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -677,21 +677,20 @@ dma_mark_declared_memory_occupied(struct device *dev,
  * Managed DMA API
  */
 #ifdef CONFIG_HAS_DMA
-extern void *dmam_alloc_coherent(struct device *dev, size_t size,
-				 dma_addr_t *dma_handle, gfp_t gfp);
+extern void *dmam_alloc_attrs(struct device *dev, size_t size,
+				 dma_addr_t *dma_handle, gfp_t gfp,
+				 unsigned long attrs);
 extern void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
 			       dma_addr_t dma_handle);
 #else /* !CONFIG_HAS_DMA */
-static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
-					dma_addr_t *dma_handle, gfp_t gfp)
+static inline void *dmam_alloc_attrs(struct device *dev, size_t size,
+					dma_addr_t *dma_handle, gfp_t gfp,
+					unsigned long attrs)
 { return NULL; }
 static inline void dmam_free_coherent(struct device *dev, size_t size,
 				      void *vaddr, dma_addr_t dma_handle) { }
 #endif /* !CONFIG_HAS_DMA */
 
-extern void *dmam_alloc_attrs(struct device *dev, size_t size,
-			      dma_addr_t *dma_handle, gfp_t gfp,
-			      unsigned long attrs);
 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
 extern int dmam_declare_coherent_memory(struct device *dev,
 					phys_addr_t phys_addr,
@@ -711,6 +710,13 @@ static inline void dmam_release_declared_memory(struct device *dev)
 }
 #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
 
+static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp)
+{
+	return dmam_alloc_attrs(dev, size, dma_handle, gfp,
+			(gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0);
+}
+
 static inline void *dma_alloc_wc(struct device *dev, size_t size,
 				 dma_addr_t *dma_addr, gfp_t gfp)
 {
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index d7c34d2d1ba5..f00544cda4e9 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -45,45 +45,6 @@ static int dmam_match(struct device *dev, void *res, void *match_data)
 	return 0;
 }
 
-/**
- * dmam_alloc_coherent - Managed dma_alloc_coherent()
- * @dev: Device to allocate coherent memory for
- * @size: Size of allocation
- * @dma_handle: Out argument for allocated DMA handle
- * @gfp: Allocation flags
- *
- * Managed dma_alloc_coherent().  Memory allocated using this function
- * will be automatically released on driver detach.
- *
- * RETURNS:
- * Pointer to allocated memory on success, NULL on failure.
- */
-void *dmam_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t gfp)
-{
-	struct dma_devres *dr;
-	void *vaddr;
-
-	dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
-	if (!dr)
-		return NULL;
-
-	vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp);
-	if (!vaddr) {
-		devres_free(dr);
-		return NULL;
-	}
-
-	dr->vaddr = vaddr;
-	dr->dma_handle = *dma_handle;
-	dr->size = size;
-
-	devres_add(dev, dr);
-
-	return vaddr;
-}
-EXPORT_SYMBOL(dmam_alloc_coherent);
-
 /**
  * dmam_free_coherent - Managed dma_free_coherent()
  * @dev: Device to free coherent memory for
-- 
cgit 


From 4788ba5792cc1368ba4867e1488dc168b4fe97b7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Dec 2018 07:51:44 +0100
Subject: dma-mapping: remove dmam_{declare,release}_coherent_memory

These functions have never been used.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/driver-model/devres.txt |  1 -
 include/linux/dma-mapping.h           | 19 ------------
 kernel/dma/mapping.c                  | 55 -----------------------------------
 3 files changed, 75 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 841c99529d27..b277cafce71e 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -250,7 +250,6 @@ DMA
   dmaenginem_async_device_register()
   dmam_alloc_coherent()
   dmam_alloc_attrs()
-  dmam_declare_coherent_memory()
   dmam_free_coherent()
   dmam_pool_create()
   dmam_pool_destroy()
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index fa2ebe8ad4d0..937c2a949fca 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -691,25 +691,6 @@ static inline void dmam_free_coherent(struct device *dev, size_t size,
 				      void *vaddr, dma_addr_t dma_handle) { }
 #endif /* !CONFIG_HAS_DMA */
 
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
-extern int dmam_declare_coherent_memory(struct device *dev,
-					phys_addr_t phys_addr,
-					dma_addr_t device_addr, size_t size,
-					int flags);
-extern void dmam_release_declared_memory(struct device *dev);
-#else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
-static inline int dmam_declare_coherent_memory(struct device *dev,
-				phys_addr_t phys_addr, dma_addr_t device_addr,
-				size_t size, gfp_t gfp)
-{
-	return 0;
-}
-
-static inline void dmam_release_declared_memory(struct device *dev)
-{
-}
-#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
-
 static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp)
 {
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f00544cda4e9..a11006b6d8e8 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -105,61 +105,6 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 }
 EXPORT_SYMBOL(dmam_alloc_attrs);
 
-#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
-
-static void dmam_coherent_decl_release(struct device *dev, void *res)
-{
-	dma_release_declared_memory(dev);
-}
-
-/**
- * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory()
- * @dev: Device to declare coherent memory for
- * @phys_addr: Physical address of coherent memory to be declared
- * @device_addr: Device address of coherent memory to be declared
- * @size: Size of coherent memory to be declared
- * @flags: Flags
- *
- * Managed dma_declare_coherent_memory().
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-				 dma_addr_t device_addr, size_t size, int flags)
-{
-	void *res;
-	int rc;
-
-	res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL);
-	if (!res)
-		return -ENOMEM;
-
-	rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size,
-					 flags);
-	if (!rc)
-		devres_add(dev, res);
-	else
-		devres_free(res);
-
-	return rc;
-}
-EXPORT_SYMBOL(dmam_declare_coherent_memory);
-
-/**
- * dmam_release_declared_memory - Managed dma_release_declared_memory().
- * @dev: Device to release declared coherent memory for
- *
- * Managed dmam_release_declared_memory().
- */
-void dmam_release_declared_memory(struct device *dev)
-{
-	WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL));
-}
-EXPORT_SYMBOL(dmam_release_declared_memory);
-
-#endif
-
 /*
  * Create scatter-list for the already allocated DMA buffer.
  */
-- 
cgit 


From 48e638fb68be8fecdca0611beff53a9c947704e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Jan 2019 17:14:39 +0100
Subject: dma-mapping: remove a few unused exports

Now that the slow path DMA API calls are implemented out of line a few
helpers only used by them don't need to be exported anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/coherent.c | 2 --
 kernel/dma/debug.c    | 2 --
 2 files changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 597d40893862..66f0fb7e9a3a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -223,7 +223,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
 	 */
 	return mem->flags & DMA_MEMORY_EXCLUSIVE;
 }
-EXPORT_SYMBOL(dma_alloc_from_dev_coherent);
 
 void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
 {
@@ -268,7 +267,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
 
 	return __dma_release_from_coherent(mem, order, vaddr);
 }
-EXPORT_SYMBOL(dma_release_from_dev_coherent);
 
 int dma_release_from_global_coherent(int order, void *vaddr)
 {
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 1e0157113d15..23cf5361bcf1 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1512,7 +1512,6 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
 
 	add_dma_entry(entry);
 }
-EXPORT_SYMBOL(debug_dma_alloc_coherent);
 
 void debug_dma_free_coherent(struct device *dev, size_t size,
 			 void *virt, dma_addr_t addr)
@@ -1540,7 +1539,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
 
 	check_unmap(&ref);
 }
-EXPORT_SYMBOL(debug_dma_free_coherent);
 
 void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
 			    int direction, dma_addr_t dma_addr)
-- 
cgit 


From 594cc251fdd0d231d342d88b2fdff4bc42fb0690 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 4 Jan 2019 12:56:09 -0800
Subject: make 'user_access_begin()' do 'access_ok()'

Originally, the rule used to be that you'd have to do access_ok()
separately, and then user_access_begin() before actually doing the
direct (optimized) user access.

But experience has shown that people then decide not to do access_ok()
at all, and instead rely on it being implied by other operations or
similar.  Which makes it very hard to verify that the access has
actually been range-checked.

If you use the unsafe direct user accesses, hardware features (either
SMAP - Supervisor Mode Access Protection - on x86, or PAN - Privileged
Access Never - on ARM) do force you to use user_access_begin().  But
nothing really forces the range check.

By putting the range check into user_access_begin(), we actually force
people to do the right thing (tm), and the range check vill be visible
near the actual accesses.  We have way too long a history of people
trying to avoid them.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess.h             |  9 ++++++++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 15 +++++++++++++--
 include/linux/uaccess.h                    |  2 +-
 kernel/compat.c                            |  6 ++----
 kernel/exit.c                              |  6 ++----
 lib/strncpy_from_user.c                    |  9 +++++----
 lib/strnlen_user.c                         |  9 +++++----
 7 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 3920f456db79..a87ab5290ab4 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -705,7 +705,14 @@ extern struct movsl_mask {
  * checking before using them, but you have to surround them with the
  * user_access_begin/end() pair.
  */
-#define user_access_begin()	__uaccess_begin()
+static __must_check inline bool user_access_begin(const void __user *ptr, size_t len)
+{
+	if (unlikely(!access_ok(ptr,len)))
+		return 0;
+	__uaccess_begin();
+	return 1;
+}
+#define user_access_begin(a,b)	user_access_begin(a,b)
 #define user_access_end()	__uaccess_end()
 
 #define unsafe_put_user(x, ptr, err_label)					\
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 55d8f9b8777f..485b259127c3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1624,7 +1624,9 @@ end_user:
 		 * happened we would make the mistake of assuming that the
 		 * relocations were valid.
 		 */
-		user_access_begin();
+		if (!user_access_begin(urelocs, size))
+			goto end_user;
+
 		for (copied = 0; copied < nreloc; copied++)
 			unsafe_put_user(-1,
 					&urelocs[copied].presumed_offset,
@@ -2606,7 +2608,16 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
 		unsigned int i;
 
 		/* Copy the new buffer offsets back to the user's exec list. */
-		user_access_begin();
+		/*
+		 * Note: count * sizeof(*user_exec_list) does not overflow,
+		 * because we checked 'count' in check_buffer_count().
+		 *
+		 * And this range already got effectively checked earlier
+		 * when we did the "copy_from_user()" above.
+		 */
+		if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list)))
+			goto end_user;
+
 		for (i = 0; i < args->buffer_count; i++) {
 			if (!(exec2_list[i].offset & UPDATE))
 				continue;
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index bf2523867a02..37b226e8df13 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -264,7 +264,7 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 	probe_kernel_read(&retval, addr, sizeof(retval))
 
 #ifndef user_access_begin
-#define user_access_begin() do { } while (0)
+#define user_access_begin(ptr,len) access_ok(ptr, len)
 #define user_access_end() do { } while (0)
 #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
 #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
diff --git a/kernel/compat.c b/kernel/compat.c
index 705d4ae6c018..f01affa17e22 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -354,10 +354,9 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
 	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
-	if (!access_ok(umask, bitmap_size / 8))
+	if (!user_access_begin(umask, bitmap_size / 8))
 		return -EFAULT;
 
-	user_access_begin();
 	while (nr_compat_longs > 1) {
 		compat_ulong_t l1, l2;
 		unsafe_get_user(l1, umask++, Efault);
@@ -384,10 +383,9 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
 	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
-	if (!access_ok(umask, bitmap_size / 8))
+	if (!user_access_begin(umask, bitmap_size / 8))
 		return -EFAULT;
 
-	user_access_begin();
 	while (nr_compat_longs > 1) {
 		unsigned long m = *mask++;
 		unsafe_put_user((compat_ulong_t)m, umask++, Efault);
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a01b671dc1f..2d14979577ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1604,10 +1604,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
-	if (!access_ok(infop, sizeof(*infop)))
+	if (!user_access_begin(infop, sizeof(*infop)))
 		return -EFAULT;
 
-	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
 	unsafe_put_user(info.cause, &infop->si_code, Efault);
@@ -1732,10 +1731,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
-	if (!access_ok(infop, sizeof(*infop)))
+	if (!user_access_begin(infop, sizeof(*infop)))
 		return -EFAULT;
 
-	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
 	unsafe_put_user(info.cause, &infop->si_code, Efault);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index b53e1b5d80f4..58eacd41526c 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -114,10 +114,11 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 
 		kasan_check_write(dst, count);
 		check_object_size(dst, count, false);
-		user_access_begin();
-		retval = do_strncpy_from_user(dst, src, count, max);
-		user_access_end();
-		return retval;
+		if (user_access_begin(src, max)) {
+			retval = do_strncpy_from_user(dst, src, count, max);
+			user_access_end();
+			return retval;
+		}
 	}
 	return -EFAULT;
 }
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 60d0bbda8f5e..1c1a1b0e38a5 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -114,10 +114,11 @@ long strnlen_user(const char __user *str, long count)
 		unsigned long max = max_addr - src_addr;
 		long retval;
 
-		user_access_begin();
-		retval = do_strnlen_user(str, count, max);
-		user_access_end();
-		return retval;
+		if (user_access_begin(str, max)) {
+			retval = do_strnlen_user(str, count, max);
+			user_access_end();
+			return retval;
+		}
 	}
 	return 0;
 }
-- 
cgit 


From 09be178400829dddc1189b50a7888495dd26aa84 Mon Sep 17 00:00:00 2001
From: Cheng Lin <cheng.lin130@zte.com.cn>
Date: Thu, 3 Jan 2019 15:26:13 -0800
Subject: proc/sysctl: fix return error for proc_doulongvec_minmax()

If the number of input parameters is less than the total parameters, an
EINVAL error will be returned.

For example, we use proc_doulongvec_minmax to pass up to two parameters
with kern_table:

{
	.procname       = "monitor_signals",
	.data           = &monitor_sigs,
	.maxlen         = 2*sizeof(unsigned long),
	.mode           = 0644,
	.proc_handler   = proc_doulongvec_minmax,
},

Reproduce:

When passing two parameters, it's work normal.  But passing only one
parameter, an error "Invalid argument"(EINVAL) is returned.

  [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals
  [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals
  1       2
  [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals
  -bash: echo: write error: Invalid argument
  [root@cl150 ~]# echo $?
  1
  [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals
  3       2
  [root@cl150 ~]#

The following is the result after apply this patch.  No error is
returned when the number of input parameters is less than the total
parameters.

  [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals
  [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals
  1       2
  [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals
  [root@cl150 ~]# echo $?
  0
  [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals
  3       2
  [root@cl150 ~]#

There are three processing functions dealing with digital parameters,
__do_proc_dointvec/__do_proc_douintvec/__do_proc_doulongvec_minmax.

This patch deals with __do_proc_doulongvec_minmax, just as
__do_proc_dointvec does, adding a check for parameters 'left'.  In
__do_proc_douintvec, its code implementation explicitly does not support
multiple inputs.

static int __do_proc_douintvec(...){
         ...
         /*
          * Arrays are not supported, keep this simple. *Do not* add
          * support for them.
          */
         if (vleft != 1) {
                 *lenp = 0;
                 return -EINVAL;
         }
         ...
}

So, just __do_proc_doulongvec_minmax has the problem.  And most use of
proc_doulongvec_minmax/proc_doulongvec_ms_jiffies_minmax just have one
parameter.

Link: http://lkml.kernel.org/r/1544081775-15720-1-git-send-email-cheng.lin130@zte.com.cn
Signed-off-by: Cheng Lin <cheng.lin130@zte.com.cn>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1825f712e73b..7f6c1a3b3485 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2787,6 +2787,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 			bool neg;
 
 			left -= proc_skip_spaces(&p);
+			if (!left)
+				break;
 
 			err = proc_get_long(&p, &left, &val, &neg,
 					     proc_wspace_sep,
-- 
cgit 


From 168e06f7937d96c7222037d8a05565e8a6eb00fe Mon Sep 17 00:00:00 2001
From: "Liu, Chuansheng" <chuansheng.liu@intel.com>
Date: Thu, 3 Jan 2019 15:26:27 -0800
Subject: kernel/hung_task.c: force console verbose before panic

Based on commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks
before panic"), we could get the call stack of hung task.

However, if the console loglevel is not high, we still can not see the
useful panic information in practice, and in most cases users don't set
console loglevel to high level.

This patch is to force console verbose before system panic, so that the
real useful information can be seen in the console, instead of being
like the following, which doesn't have hung task information.

  INFO: task init:1 blocked for more than 120 seconds.
        Tainted: G     U  W         4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  Kernel panic - not syncing: hung_task: blocked tasks
  CPU: 2 PID: 479 Comm: khungtaskd Tainted: G     U  W         4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1
  Call Trace:
   dump_stack+0x4f/0x65
   panic+0xde/0x231
   watchdog+0x290/0x410
   kthread+0x12c/0x150
   ret_from_fork+0x35/0x40
  reboot: panic mode set: p,w
  Kernel Offset: 0x34000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A6015B675@SHSMSX101.ccr.corp.intel.com
Signed-off-by: Chuansheng Liu <chuansheng.liu@intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/hung_task.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index cb8e3e8ac7b9..85af0cde7f46 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -112,8 +112,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 	trace_sched_process_hang(t);
 
-	if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
-		return;
+	if (sysctl_hung_task_panic) {
+		console_verbose();
+		hung_task_show_lock = true;
+		hung_task_call_panic = true;
+	}
 
 	/*
 	 * Ok, the task did not get scheduled for more than 2 minutes,
@@ -135,11 +138,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	}
 
 	touch_nmi_watchdog();
-
-	if (sysctl_hung_task_panic) {
-		hung_task_show_lock = true;
-		hung_task_call_panic = true;
-	}
 }
 
 /*
-- 
cgit 


From 304ae42739b108305f8d7b3eb3c1aec7c2b643a9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 3 Jan 2019 15:26:31 -0800
Subject: kernel/hung_task.c: break RCU locks based on jiffies

check_hung_uninterruptible_tasks() is currently calling rcu_lock_break()
for every 1024 threads.  But check_hung_task() is very slow if printk()
was called, and is very fast otherwise.

If many threads within some 1024 threads called printk(), the RCU grace
period might be extended enough to trigger RCU stall warnings.
Therefore, calling rcu_lock_break() for every some fixed jiffies will be
safer.

Link: http://lkml.kernel.org/r/1544800658-11423-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/hung_task.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 85af0cde7f46..4a9191617076 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -34,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
  * is disabled during the critical section. It also controls the size of
  * the RCU grace period. So it needs to be upper-bound.
  */
-#define HUNG_TASK_BATCHING 1024
+#define HUNG_TASK_LOCK_BREAK (HZ / 10)
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -171,7 +171,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
-	int batch_count = HUNG_TASK_BATCHING;
+	unsigned long last_break = jiffies;
 	struct task_struct *g, *t;
 
 	/*
@@ -186,10 +186,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	for_each_process_thread(g, t) {
 		if (!max_count--)
 			goto unlock;
-		if (!--batch_count) {
-			batch_count = HUNG_TASK_BATCHING;
+		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
 			if (!rcu_lock_break(g, t))
 				goto unlock;
+			last_break = jiffies;
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-- 
cgit 


From fb5bf31722d0805a3f394f7d59f2e8cd07acccb7 Mon Sep 17 00:00:00 2001
From: Yi Wang <wang.yi59@zte.com.cn>
Date: Thu, 3 Jan 2019 15:28:03 -0800
Subject: fork: fix some -Wmissing-prototypes warnings

We get a warning when building kernel with W=1:

  kernel/fork.c:167:13: warning: no previous prototype for `arch_release_thread_stack' [-Wmissing-prototypes]
  kernel/fork.c:779:13: warning: no previous prototype for `fork_init' [-Wmissing-prototypes]

Add the missing declaration in head file to fix this.

Also, remove arch_release_thread_stack() completely because no arch
seems to implement it since bb9d81264 (arch: remove tile port).

Link: http://lkml.kernel.org/r/1542170087-23645-1-git-send-email-wang.yi59@zte.com.cn
Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/task.h | 2 ++
 init/main.c                | 1 -
 kernel/fork.c              | 5 -----
 3 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 108ede99e533..44c6f15800ff 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -39,6 +39,8 @@ void __noreturn do_task_dead(void);
 
 extern void proc_caches_init(void);
 
+extern void fork_init(void);
+
 extern void release_task(struct task_struct * p);
 
 #ifdef CONFIG_HAVE_COPY_THREAD_TLS
diff --git a/init/main.c b/init/main.c
index 6a74ba0892d2..e2e80ca3165a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -105,7 +105,6 @@
 static int kernel_init(void *);
 
 extern void init_IRQ(void);
-extern void fork_init(void);
 extern void radix_tree_init(void);
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index d439c48ecf18..a60459947f18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-void __weak arch_release_thread_stack(unsigned long *stack)
-{
-}
-
 #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
 
 /*
@@ -422,7 +418,6 @@ static void release_task_stack(struct task_struct *tsk)
 		return;  /* Better to leak the stack than to free prematurely */
 
 	account_kernel_stack(tsk, -1);
-	arch_release_thread_stack(tsk->stack);
 	free_thread_stack(tsk);
 	tsk->stack = NULL;
 #ifdef CONFIG_VMAP_STACK
-- 
cgit 


From d999bd9392dea7c1a9ac43b8680b22c4425ae4c7 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 3 Jan 2019 15:28:17 -0800
Subject: panic: add options to print system info when panic happens

Kernel panic issues are always painful to debug, partially because it's
not easy to get enough information of the context when panic happens.

And we have ramoops and kdump for that, while this commit tries to
provide a easier way to show the system info by adding a cmdline
parameter, referring some idea from sysrq handler.

Link: http://lkml.kernel.org/r/1543398842-19295-2-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 +++++++
 kernel/panic.c                                  | 28 +++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 408781ee142c..e7b5c49702bb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3092,6 +3092,14 @@
 			timeout < 0: reboot immediately
 			Format: <timeout>
 
+	panic_print=	Bitmask for printing system info when panic happens.
+			User can chose combination of the following bits:
+			bit 0: print all tasks info
+			bit 1: print system memory info
+			bit 2: print timer info
+			bit 3: print locks info if CONFIG_LOCKDEP is on
+			bit 4: print ftrace buffer
+
 	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
 			on a WARN().
 
diff --git a/kernel/panic.c b/kernel/panic.c
index d10c340c43b0..855f66738bc7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -46,6 +46,13 @@ int panic_on_warn __read_mostly;
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
 EXPORT_SYMBOL_GPL(panic_timeout);
 
+#define PANIC_PRINT_TASK_INFO		0x00000001
+#define PANIC_PRINT_MEM_INFO		0x00000002
+#define PANIC_PRINT_TIMER_INFO		0x00000004
+#define PANIC_PRINT_LOCK_INFO		0x00000008
+#define PANIC_PRINT_FTRACE_INFO		0x00000010
+static unsigned long panic_print;
+
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
@@ -125,6 +132,24 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
 }
 EXPORT_SYMBOL(nmi_panic);
 
+static void panic_print_sys_info(void)
+{
+	if (panic_print & PANIC_PRINT_TASK_INFO)
+		show_state();
+
+	if (panic_print & PANIC_PRINT_MEM_INFO)
+		show_mem(0, NULL);
+
+	if (panic_print & PANIC_PRINT_TIMER_INFO)
+		sysrq_timer_list_show();
+
+	if (panic_print & PANIC_PRINT_LOCK_INFO)
+		debug_show_all_locks();
+
+	if (panic_print & PANIC_PRINT_FTRACE_INFO)
+		ftrace_dump(DUMP_ALL);
+}
+
 /**
  *	panic - halt the system
  *	@fmt: The text string to print
@@ -254,6 +279,8 @@ void panic(const char *fmt, ...)
 	debug_locks_off();
 	console_flush_on_panic();
 
+	panic_print_sys_info();
+
 	if (!panic_blink)
 		panic_blink = no_blink;
 
@@ -658,6 +685,7 @@ void refcount_error_report(struct pt_regs *regs, const char *err)
 #endif
 
 core_param(panic, panic_timeout, int, 0644);
+core_param(panic_print, panic_print, ulong, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
 core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
-- 
cgit 


From 81c9d43f94870be66146739c6e61df40dc17bb64 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 3 Jan 2019 15:28:20 -0800
Subject: kernel/sysctl: add panic_print into sysctl

So that we can also runtime chose to print out the needed system info
for panic, other than setting the kernel cmdline.

Link: http://lkml.kernel.org/r/1543398842-19295-3-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 17 +++++++++++++++++
 include/linux/kernel.h          |  1 +
 include/uapi/linux/sysctl.h     |  1 +
 kernel/panic.c                  |  2 +-
 kernel/sysctl.c                 |  7 +++++++
 kernel/sysctl_binary.c          |  1 +
 6 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1b8775298cf7..c0527d8a468a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_stackoverflow
 - panic_on_unrecovered_nmi
 - panic_on_warn
+- panic_print
 - panic_on_rcu_stall
 - perf_cpu_time_max_percent
 - perf_event_paranoid
@@ -654,6 +655,22 @@ a kernel rebuild when attempting to kdump at the location of a WARN().
 
 ==============================================================
 
+panic_print:
+
+Bitmask for printing system info when panic happens. User can chose
+combination of the following bits:
+
+bit 0: print all tasks info
+bit 1: print system memory info
+bit 2: print timer info
+bit 3: print locks info if CONFIG_LOCKDEP is on
+bit 4: print ftrace buffer
+
+So for example to print tasks and memory info on panic, user can:
+  echo 3 > /proc/sys/kernel/panic_print
+
+==============================================================
+
 panic_on_rcu_stall:
 
 When set to 1, calls panic() after RCU stall detection messages. This
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6aac75b51ba..8f0e68e250a7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -527,6 +527,7 @@ static inline u32 int_sqrt64(u64 x)
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
+extern unsigned long panic_print;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d71013fffaf6..87aa2a6d9125 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
 	KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */
+	KERN_PANIC_PRINT=78, /* ulong: bitmask to print system info on panic */
 };
 
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 855f66738bc7..f121e6ba7e11 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_TIMER_INFO		0x00000004
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO		0x00000010
-static unsigned long panic_print;
+unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f6c1a3b3485..ba4d9e85feb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -807,6 +807,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "panic_print",
+		.data		= &panic_print,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 #if defined CONFIG_PRINTK
 	{
 		.procname	= "printk",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 07148b497451..73c132095a7b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = {
 	{ CTL_INT,	KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
 	{ CTL_INT,	KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
 	{ CTL_INT,	KERN_PANIC_ON_WARN,		"panic_on_warn" },
+	{ CTL_ULONG,	KERN_PANIC_PRINT,		"panic_print" },
 	{}
 };
 
-- 
cgit 


From 634724431607f6f46c495dfef801a1c8b44a96d9 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Thu, 3 Jan 2019 15:28:24 -0800
Subject: kernel/kcov.c: mark write_comp_data() as notrace

Since __sanitizer_cov_trace_const_cmp4 is marked as notrace, the
function called from __sanitizer_cov_trace_const_cmp4 shouldn't be
traceable either.  ftrace_graph_caller() gets called every time func
write_comp_data() gets called if it isn't marked 'notrace'.  This is the
backtrace from gdb:

 #0  ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:179
 #1  0xffffff8010201920 in ftrace_caller () at ../arch/arm64/kernel/entry-ftrace.S:151
 #2  0xffffff8010439714 in write_comp_data (type=5, arg1=0, arg2=0, ip=18446743524224276596) at ../kernel/kcov.c:116
 #3  0xffffff8010439894 in __sanitizer_cov_trace_const_cmp4 (arg1=<optimized out>, arg2=<optimized out>) at ../kernel/kcov.c:188
 #4  0xffffff8010201874 in prepare_ftrace_return (self_addr=18446743524226602768, parent=0xffffff801014b918, frame_pointer=18446743524223531344) at ./include/generated/atomic-instrumented.h:27
 #5  0xffffff801020194c in ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:182

Rework so that write_comp_data() that are called from
__sanitizer_cov_trace_*_cmp*() are marked as 'notrace'.

Commit 903e8ff86753 ("kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace")
missed to mark write_comp_data() as 'notrace'. When that patch was
created gcc-7 was used. In lib/Kconfig.debug
config KCOV_ENABLE_COMPARISONS
	depends on $(cc-option,-fsanitize-coverage=trace-cmp)

That code path isn't hit with gcc-7. However, it were that with gcc-8.

Link: http://lkml.kernel.org/r/20181206143011.23719-1-anders.roxell@linaro.org
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Co-developed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 97959d7b77e2..c2277dbdbfb1 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -112,7 +112,7 @@ void notrace __sanitizer_cov_trace_pc(void)
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
 
 #ifdef CONFIG_KCOV_ENABLE_COMPARISONS
-static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
 {
 	struct task_struct *t;
 	u64 *area;
-- 
cgit 


From 3bb5f4ac55dd91d516e7e36b45c94bd57efbb068 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Thu, 3 Jan 2019 15:28:44 -0800
Subject: kernel/locking/mutex.c: remove caller signal_pending branch
 predictions

This is already done for us internally by the signal machinery.

Link: http://lkml.kernel.org/r/20181116002713.8474-2-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/mutex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 3f8a35104285..db578783dd36 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -987,7 +987,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * wait_lock. This ensures the lock cancellation is ordered
 		 * against mutex_unlock() and wake-ups do not go missing.
 		 */
-		if (unlikely(signal_pending_state(state, current))) {
+		if (signal_pending_state(state, current)) {
 			ret = -EINTR;
 			goto err;
 		}
-- 
cgit 


From 34ec35ad8f5f4624e8391dbb83afb4c791f027e3 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Thu, 3 Jan 2019 15:28:48 -0800
Subject: kernel/sched/: remove caller signal_pending branch predictions

This is already done for us internally by the signal machinery.

Link: http://lkml.kernel.org/r/20181116002713.8474-3-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/swait.c | 2 +-
 kernel/sched/wait.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f66920173370..17a954c9e153 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3416,7 +3416,7 @@ static void __sched notrace __schedule(bool preempt)
 
 	switch_count = &prev->nivcsw;
 	if (!preempt && prev->state) {
-		if (unlikely(signal_pending_state(prev->state, prev))) {
+		if (signal_pending_state(prev->state, prev)) {
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 66b59ac77c22..e83a3f8449f6 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -93,7 +93,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait
 	long ret = 0;
 
 	raw_spin_lock_irqsave(&q->lock, flags);
-	if (unlikely(signal_pending_state(state, current))) {
+	if (signal_pending_state(state, current)) {
 		/*
 		 * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
 		 * must not see us.
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5dd47f1103d1..6eb1f8efd221 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -264,7 +264,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en
 	long ret = 0;
 
 	spin_lock_irqsave(&wq_head->lock, flags);
-	if (unlikely(signal_pending_state(state, current))) {
+	if (signal_pending_state(state, current)) {
 		/*
 		 * Exclusive waiter must not fail if it was selected by wakeup,
 		 * it should "consume" the condition we were waiting for.
-- 
cgit 


From 8270f3a11ceef64bdb0ab141180e8d2f17c619ec Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jan 2019 18:31:48 +0100
Subject: dma-direct: fix DMA_ATTR_NO_KERNEL_MAPPING for remapped allocations

We need to return a dma_addr_t even if we don't have a kernel mapping.
Do so by consolidating the phys_to_dma call in a single place and jump
to it from all the branches that return successfully.

Fixes: bfd56cd60521 ("dma-mapping: support highmem in the generic remap allocator")
Reported-by: Liviu Dudau <liviu@dudau.co.uk
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Liviu Dudau <liviu@dudau.co.uk>
---
 kernel/dma/remap.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 18cc09fc27b9..7a723194ecbe 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -204,8 +204,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		ret = dma_alloc_from_pool(size, &page, flags);
 		if (!ret)
 			return NULL;
-		*dma_handle = phys_to_dma(dev, page_to_phys(page));
-		return ret;
+		goto done;
 	}
 
 	page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
@@ -215,8 +214,10 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* remove any dirty cache lines on the kernel alias */
 	arch_dma_prep_coherent(page, size);
 
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING)
-		return page; /* opaque cookie */
+	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+		ret = page; /* opaque cookie */
+		goto done;
+	}
 
 	/* create a coherent mapping */
 	ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
@@ -227,9 +228,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		return ret;
 	}
 
-	*dma_handle = phys_to_dma(dev, page_to_phys(page));
 	memset(ret, 0, size);
-
+done:
+	*dma_handle = phys_to_dma(dev, page_to_phys(page));
 	return ret;
 }
 
-- 
cgit 


From e9666d10a5677a494260d60d1fa0b73cc7646eb3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 31 Dec 2018 00:14:15 +0900
Subject: jump_label: move 'asm goto' support test to Kconfig

Currently, CONFIG_JUMP_LABEL just means "I _want_ to use jump label".

The jump label is controlled by HAVE_JUMP_LABEL, which is defined
like this:

  #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
  # define HAVE_JUMP_LABEL
  #endif

We can improve this by testing 'asm goto' support in Kconfig, then
make JUMP_LABEL depend on CC_HAS_ASM_GOTO.

Ugly #ifdef HAVE_JUMP_LABEL will go away, and CONFIG_JUMP_LABEL will
match to the real kernel capability.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
---
 Makefile                                          |  7 -------
 arch/Kconfig                                      |  1 +
 arch/arm/kernel/jump_label.c                      |  4 ----
 arch/arm64/kernel/jump_label.c                    |  4 ----
 arch/mips/kernel/jump_label.c                     |  4 ----
 arch/powerpc/include/asm/asm-prototypes.h         |  2 +-
 arch/powerpc/kernel/jump_label.c                  |  2 --
 arch/powerpc/platforms/powernv/opal-tracepoints.c |  2 +-
 arch/powerpc/platforms/powernv/opal-wrappers.S    |  2 +-
 arch/powerpc/platforms/pseries/hvCall.S           |  4 ++--
 arch/powerpc/platforms/pseries/lpar.c             |  2 +-
 arch/s390/kernel/Makefile                         |  3 ++-
 arch/s390/kernel/jump_label.c                     |  4 ----
 arch/sparc/kernel/Makefile                        |  2 +-
 arch/sparc/kernel/jump_label.c                    |  4 ----
 arch/x86/Makefile                                 |  2 +-
 arch/x86/entry/calling.h                          |  2 +-
 arch/x86/include/asm/cpufeature.h                 |  2 +-
 arch/x86/include/asm/jump_label.h                 | 13 -------------
 arch/x86/include/asm/rmwcc.h                      |  6 +++---
 arch/x86/kernel/Makefile                          |  3 ++-
 arch/x86/kernel/jump_label.c                      |  4 ----
 arch/x86/kvm/emulate.c                            |  2 +-
 arch/xtensa/kernel/jump_label.c                   |  4 ----
 include/linux/dynamic_debug.h                     |  6 +++---
 include/linux/jump_label.h                        | 22 +++++++++-------------
 include/linux/jump_label_ratelimit.h              |  8 +++-----
 include/linux/module.h                            |  2 +-
 include/linux/netfilter.h                         |  4 ++--
 include/linux/netfilter_ingress.h                 |  2 +-
 init/Kconfig                                      |  3 +++
 kernel/jump_label.c                               | 10 +++-------
 kernel/module.c                                   |  2 +-
 kernel/sched/core.c                               |  2 +-
 kernel/sched/debug.c                              |  4 ++--
 kernel/sched/fair.c                               |  6 +++---
 kernel/sched/sched.h                              |  6 +++---
 lib/dynamic_debug.c                               |  2 +-
 net/core/dev.c                                    |  6 +++---
 net/netfilter/core.c                              |  6 +++---
 scripts/gcc-goto.sh                               |  2 +-
 tools/arch/x86/include/asm/rmwcc.h                |  6 +++---
 42 files changed, 65 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index 60a473247657..04a857817f77 100644
--- a/Makefile
+++ b/Makefile
@@ -514,13 +514,6 @@ RETPOLINE_VDSO_CFLAGS := $(call cc-option,$(RETPOLINE_VDSO_CFLAGS_GCC),$(call cc
 export RETPOLINE_CFLAGS
 export RETPOLINE_VDSO_CFLAGS
 
-# check for 'asm goto'
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
-  CC_HAVE_ASM_GOTO := 1
-  KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
-  KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
-endif
-
 # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included.
 # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile.
 # CC_VERSION_TEXT is referenced from Kconfig (so it needs export),
diff --git a/arch/Kconfig b/arch/Kconfig
index b70c952ac838..4cfb6de48f79 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -71,6 +71,7 @@ config KPROBES
 config JUMP_LABEL
        bool "Optimize very unlikely/likely branches"
        depends on HAVE_ARCH_JUMP_LABEL
+       depends on CC_HAS_ASM_GOTO
        help
          This option enables a transparent branch optimization that
 	 makes certain almost-always-true or almost-always-false branch
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index 90bce3d9928e..303b3ab87f7e 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -4,8 +4,6 @@
 #include <asm/patch.h>
 #include <asm/insn.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 static void __arch_jump_label_transform(struct jump_entry *entry,
 					enum jump_label_type type,
 					bool is_static)
@@ -35,5 +33,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry,
 {
 	__arch_jump_label_transform(entry, type, true);
 }
-
-#endif
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index 646b9562ee64..1eff270e8861 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -20,8 +20,6 @@
 #include <linux/jump_label.h>
 #include <asm/insn.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -49,5 +47,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry,
 	 * NOP needs to be replaced by a branch.
 	 */
 }
-
-#endif	/* HAVE_JUMP_LABEL */
diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index 32e3168316cd..ab943927f97a 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -16,8 +16,6 @@
 #include <asm/cacheflush.h>
 #include <asm/inst.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 /*
  * Define parameters for the standard MIPS and the microMIPS jump
  * instruction encoding respectively:
@@ -70,5 +68,3 @@ void arch_jump_label_transform(struct jump_entry *e,
 
 	mutex_unlock(&text_mutex);
 }
-
-#endif /* HAVE_JUMP_LABEL */
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 6f201b199c02..1d911f68a23b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -38,7 +38,7 @@ extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
 /* OPAL tracing */
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 extern struct static_key opal_tracepoint_key;
 #endif
 
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index 6472472093d0..0080c5fbd225 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -11,7 +11,6 @@
 #include <linux/jump_label.h>
 #include <asm/code-patching.h>
 
-#ifdef HAVE_JUMP_LABEL
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -22,4 +21,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	else
 		patch_instruction(addr, PPC_INST_NOP);
 }
-#endif
diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
index 1ab7d26c0a2c..f16a43540e30 100644
--- a/arch/powerpc/platforms/powernv/opal-tracepoints.c
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -4,7 +4,7 @@
 #include <asm/trace.h>
 #include <asm/asm-prototypes.h>
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
 
 int opal_tracepoint_regfunc(void)
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 251528231a9e..f4875fe3f8ff 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -20,7 +20,7 @@
 	.section	".text"
 
 #ifdef CONFIG_TRACEPOINTS
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 #define OPAL_BRANCH(LABEL)					\
 	ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
 #else
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
index d91412c591ef..50dc9426d0be 100644
--- a/arch/powerpc/platforms/pseries/hvCall.S
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -19,7 +19,7 @@
 	
 #ifdef CONFIG_TRACEPOINTS
 
-#ifndef HAVE_JUMP_LABEL
+#ifndef CONFIG_JUMP_LABEL
 	.section	".toc","aw"
 
 	.globl hcall_tracepoint_refcount
@@ -79,7 +79,7 @@ hcall_tracepoint_refcount:
 	mr	r5,BUFREG;					\
 	__HCALL_INST_POSTCALL
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 #define HCALL_BRANCH(LABEL)					\
 	ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key)
 #else
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 32d4452973e7..f2a9f0adc2d3 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1040,7 +1040,7 @@ EXPORT_SYMBOL(arch_free_page);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_TRACEPOINTS
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
 
 int hcall_tracepoint_regfunc(void)
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 386b1abb217b..e216e116a9a9 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -48,7 +48,7 @@ CFLAGS_ptrace.o		+= -DUTS_MACHINE='"$(UTS_MACHINE)"'
 obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o
-obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
+obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o
@@ -72,6 +72,7 @@ obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
+obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
 obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o kexec_image.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index 50a1798604a8..3f10b56bd5a3 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -10,8 +10,6 @@
 #include <linux/jump_label.h>
 #include <asm/ipl.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 struct insn {
 	u16 opcode;
 	s32 offset;
@@ -103,5 +101,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry,
 {
 	__jump_label_transform(entry, type, 1);
 }
-
-#endif
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index cf8640841b7a..97c0e19263d1 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -118,4 +118,4 @@ pc--$(CONFIG_PERF_EVENTS) := perf_event.o
 obj-$(CONFIG_SPARC64)	+= $(pc--y)
 
 obj-$(CONFIG_UPROBES)	+= uprobes.o
-obj-$(CONFIG_SPARC64)	+= jump_label.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 7f8eac51df33..a4cfaeecaf5e 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -9,8 +9,6 @@
 
 #include <asm/cacheflush.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -47,5 +45,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
 	flushi(insn);
 	mutex_unlock(&text_mutex);
 }
-
-#endif
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 16c3145c0a5f..9c5a67d1b9c1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -289,7 +289,7 @@ vdso_install:
 
 archprepare: checkbin
 checkbin:
-ifndef CC_HAVE_ASM_GOTO
+ifndef CONFIG_CC_HAS_ASM_GOTO
 	@echo Compiler lacks asm-goto support.
 	@exit 1
 endif
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 20d0885b00fb..efb0d1b1f15f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -351,7 +351,7 @@ For 32-bit we have the following conventions - kernel is built with
  */
 .macro CALL_enter_from_user_mode
 #ifdef CONFIG_CONTEXT_TRACKING
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
 #endif
 	call enter_from_user_mode
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aced6c9290d6..ce95b8cbd229 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -140,7 +140,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 
 #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
 
-#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO)
+#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)
 
 /*
  * Workaround for the sake of BPF compilation which utilizes kernel
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 21efc9d07ed9..65191ce8e1cf 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -2,19 +2,6 @@
 #ifndef _ASM_X86_JUMP_LABEL_H
 #define _ASM_X86_JUMP_LABEL_H
 
-#ifndef HAVE_JUMP_LABEL
-/*
- * For better or for worse, if jump labels (the gcc extension) are missing,
- * then the entire static branch patching infrastructure is compiled out.
- * If that happens, the code in here will malfunction.  Raise a compiler
- * error instead.
- *
- * In theory, jump labels and the static branch patching infrastructure
- * could be decoupled to fix this.
- */
-#error asm/jump_label.h included on a non-jump-label kernel
-#endif
-
 #define JUMP_LABEL_NOP_SIZE 5
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 46ac84b506f5..8a9eba191516 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -11,7 +11,7 @@
 
 #define __CLOBBERS_MEM(clb...)	"memory", ## clb
 
-#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
+#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO)
 
 /* Use asm goto */
 
@@ -27,7 +27,7 @@ cc_label:	c = true;						\
 	c;								\
 })
 
-#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
 
 /* Use flags output or a set instruction */
 
@@ -40,7 +40,7 @@ cc_label:	c = true;						\
 	c;								\
 })
 
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
 
 #define GEN_UNARY_RMWcc_4(op, var, cc, arg0)				\
 	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index eb51b0e1189c..00b7e27bc2b7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -49,7 +49,8 @@ obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o dumpstack.o nmi.o
 obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
-obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-y			+= setup.o x86_init.o i8259.o irqinit.o
+obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index aac0c1f7e354..f99bd26bd3f1 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,8 +16,6 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 union jump_code_union {
 	char code[JUMP_LABEL_NOP_SIZE];
 	struct {
@@ -130,5 +128,3 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
 	if (jlstate == JL_STATE_UPDATE)
 		__jump_label_transform(entry, type, text_poke_early, 1);
 }
-
-#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 78e430f4e15c..c338984c850d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -456,7 +456,7 @@ FOP_END;
 
 /*
  * XXX: inoutclob user must know where the argument is being expanded.
- *      Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault.
+ *      Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
  */
 #define asm_safe(insn, inoutclob...) \
 ({ \
diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c
index d108f721c116..61cf6497a646 100644
--- a/arch/xtensa/kernel/jump_label.c
+++ b/arch/xtensa/kernel/jump_label.c
@@ -10,8 +10,6 @@
 
 #include <asm/cacheflush.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 #define J_OFFSET_MASK 0x0003ffff
 #define J_SIGN_MASK (~(J_OFFSET_MASK >> 1))
 
@@ -95,5 +93,3 @@ void arch_jump_label_transform(struct jump_entry *e,
 
 	patch_text(jump_entry_code(e), &insn, JUMP_LABEL_NOP_SIZE);
 }
-
-#endif /* HAVE_JUMP_LABEL */
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 2fd8006153c3..b3419da1a776 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -2,7 +2,7 @@
 #ifndef _DYNAMIC_DEBUG_H
 #define _DYNAMIC_DEBUG_H
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+#if defined(CONFIG_JUMP_LABEL)
 #include <linux/jump_label.h>
 #endif
 
@@ -38,7 +38,7 @@ struct _ddebug {
 #define _DPRINTK_FLAGS_DEFAULT 0
 #endif
 	unsigned int flags:8;
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	union {
 		struct static_key_true dd_key_true;
 		struct static_key_false dd_key_false;
@@ -83,7 +83,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
 		dd_key_init(key, init)				\
 	}
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 
 #define dd_key_init(key, init) key = (init)
 
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 5df6a621e464..3e113a1fa0f1 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -71,10 +71,6 @@
  * Additional babbling in: Documentation/static-keys.txt
  */
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
-# define HAVE_JUMP_LABEL
-#endif
-
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
@@ -86,7 +82,7 @@ extern bool static_key_initialized;
 				    "%s(): static key '%pS' used before call to jump_label_init()", \
 				    __func__, (key))
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 
 struct static_key {
 	atomic_t enabled;
@@ -114,10 +110,10 @@ struct static_key {
 struct static_key {
 	atomic_t enabled;
 };
-#endif	/* HAVE_JUMP_LABEL */
+#endif	/* CONFIG_JUMP_LABEL */
 #endif /* __ASSEMBLY__ */
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 #include <asm/jump_label.h>
 
 #ifndef __ASSEMBLY__
@@ -192,7 +188,7 @@ enum jump_label_type {
 
 struct module;
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 
 #define JUMP_TYPE_FALSE		0UL
 #define JUMP_TYPE_TRUE		1UL
@@ -245,7 +241,7 @@ extern void static_key_disable_cpuslocked(struct static_key *key);
 	{ .enabled = { 0 },					\
 	  { .entries = (void *)JUMP_TYPE_FALSE } }
 
-#else  /* !HAVE_JUMP_LABEL */
+#else  /* !CONFIG_JUMP_LABEL */
 
 #include <linux/atomic.h>
 #include <linux/bug.h>
@@ -330,7 +326,7 @@ static inline void static_key_disable(struct static_key *key)
 #define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
 #define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
 
-#endif	/* HAVE_JUMP_LABEL */
+#endif	/* CONFIG_JUMP_LABEL */
 
 #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
 #define jump_label_enabled static_key_enabled
@@ -394,7 +390,7 @@ extern bool ____wrong_branch_error(void);
 	static_key_count((struct static_key *)x) > 0;				\
 })
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 
 /*
  * Combine the right initial value (type) with the right branch order
@@ -476,12 +472,12 @@ extern bool ____wrong_branch_error(void);
 	unlikely(branch);							\
 })
 
-#else /* !HAVE_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL */
 
 #define static_branch_likely(x)		likely(static_key_enabled(&(x)->key))
 #define static_branch_unlikely(x)	unlikely(static_key_enabled(&(x)->key))
 
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
 
 /*
  * Advanced usage; refcount, branch is enabled when: count != 0
diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index baa8eabbaa56..a49f2b45b3f0 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -5,21 +5,19 @@
 #include <linux/jump_label.h>
 #include <linux/workqueue.h>
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+#if defined(CONFIG_JUMP_LABEL)
 struct static_key_deferred {
 	struct static_key key;
 	unsigned long timeout;
 	struct delayed_work work;
 };
-#endif
 
-#ifdef HAVE_JUMP_LABEL
 extern void static_key_slow_dec_deferred(struct static_key_deferred *key);
 extern void static_key_deferred_flush(struct static_key_deferred *key);
 extern void
 jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl);
 
-#else	/* !HAVE_JUMP_LABEL */
+#else	/* !CONFIG_JUMP_LABEL */
 struct static_key_deferred {
 	struct static_key  key;
 };
@@ -38,5 +36,5 @@ jump_label_rate_limit(struct static_key_deferred *key,
 {
 	STATIC_KEY_CHECK_USE(key);
 }
-#endif	/* HAVE_JUMP_LABEL */
+#endif	/* CONFIG_JUMP_LABEL */
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index d5453eb5a68b..9a21fe3509af 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -436,7 +436,7 @@ struct module {
 	unsigned int num_bpf_raw_events;
 	struct bpf_raw_event_map *bpf_raw_events;
 #endif
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	struct jump_entry *jump_entries;
 	unsigned int num_jump_entries;
 #endif
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index bbe99d2b28b4..72cb19c3db6a 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -176,7 +176,7 @@ void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
 int nf_register_sockopt(struct nf_sockopt_ops *reg);
 void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 #endif
 
@@ -198,7 +198,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 	struct nf_hook_entries *hook_head = NULL;
 	int ret = 1;
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook) &&
 	    !static_key_false(&nf_hooks_needed[pf][hook]))
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 554c920691dd..a13774be2eb5 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -8,7 +8,7 @@
 #ifdef CONFIG_NETFILTER_INGRESS
 static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
 {
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
 		return false;
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 3e6be1694766..d47cb77a220e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -23,6 +23,9 @@ config CLANG_VERSION
 	int
 	default $(shell,$(srctree)/scripts/clang-version.sh $(CC))
 
+config CC_HAS_ASM_GOTO
+	def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
+
 config CONSTRUCTORS
 	bool
 	depends on !UML
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b28028b08d44..bad96b476eb6 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -18,8 +18,6 @@
 #include <linux/cpu.h>
 #include <asm/sections.h>
 
-#ifdef HAVE_JUMP_LABEL
-
 /* mutex to protect coming/going of the the jump_label table */
 static DEFINE_MUTEX(jump_label_mutex);
 
@@ -80,13 +78,13 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 static void jump_label_update(struct static_key *key);
 
 /*
- * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h.
  * The use of 'atomic_read()' requires atomic.h and its problematic for some
  * kernel headers such as kernel.h and others. Since static_key_count() is not
- * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
+ * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok
  * to have it be a function here. Similarly, for 'static_key_enable()' and
  * 'static_key_disable()', which require bug.h. This should allow jump_label.h
- * to be included from most/all places for HAVE_JUMP_LABEL.
+ * to be included from most/all places for CONFIG_JUMP_LABEL.
  */
 int static_key_count(struct static_key *key)
 {
@@ -791,5 +789,3 @@ static __init int jump_label_test(void)
 }
 early_initcall(jump_label_test);
 #endif /* STATIC_KEYS_SELFTEST */
-
-#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/module.c b/kernel/module.c
index fcbc0128810b..2ad1b5239910 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3102,7 +3102,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					   sizeof(*mod->bpf_raw_events),
 					   &mod->num_bpf_raw_events);
 #endif
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	mod->jump_entries = section_objs(info, "__jump_table",
 					sizeof(*mod->jump_entries),
 					&mod->num_jump_entries);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 223f78d5c111..a674c7db2f29 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,7 +24,7 @@
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
 /*
  * Debugging: various feature bits
  *
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02bd5f969b21..de3de997e245 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -73,7 +73,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 
 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
 #define jump_label_key__false STATIC_KEY_INIT_FALSE
@@ -99,7 +99,7 @@ static void sched_feat_enable(int i)
 #else
 static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
 
 static int sched_feat_set(char *cmp)
 {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6483834f1278..50aa2aba69bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4217,7 +4217,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 
 #ifdef CONFIG_CFS_BANDWIDTH
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 static struct static_key __cfs_bandwidth_used;
 
 static inline bool cfs_bandwidth_used(void)
@@ -4234,7 +4234,7 @@ void cfs_bandwidth_usage_dec(void)
 {
 	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 }
-#else /* HAVE_JUMP_LABEL */
+#else /* CONFIG_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
 {
 	return true;
@@ -4242,7 +4242,7 @@ static bool cfs_bandwidth_used(void)
 
 void cfs_bandwidth_usage_inc(void) {}
 void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
+#endif /* CONFIG_JUMP_LABEL */
 
 /*
  * default period for cfs group bandwidth.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0ba08924e017..d04530bf251f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1488,7 +1488,7 @@ enum {
 
 #undef SCHED_FEAT
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
 
 /*
  * To support run-time toggling of sched features, all the translation units
@@ -1508,7 +1508,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
 extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
 
-#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
 
 /*
  * Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1524,7 +1524,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
 
 #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 
-#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index c7c96bc7654a..dbf2b457e47e 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -188,7 +188,7 @@ static int ddebug_change(const struct ddebug_query *query,
 			newflags = (dp->flags & mask) | flags;
 			if (newflags == dp->flags)
 				continue;
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 			if (dp->flags & _DPRINTK_FLAGS_PRINT) {
 				if (!(flags & _DPRINTK_FLAGS_PRINT))
 					static_branch_disable(&dp->key.dd_key_true);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b5a4410be0e..82f20022259d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1821,7 +1821,7 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
 static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 static atomic_t netstamp_needed_deferred;
 static atomic_t netstamp_wanted;
 static void netstamp_clear(struct work_struct *work)
@@ -1840,7 +1840,7 @@ static DECLARE_WORK(netstamp_work, netstamp_clear);
 
 void net_enable_timestamp(void)
 {
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	int wanted;
 
 	while (1) {
@@ -1860,7 +1860,7 @@ EXPORT_SYMBOL(net_enable_timestamp);
 
 void net_disable_timestamp(void)
 {
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	int wanted;
 
 	while (1) {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index dc240cb47ddf..93aaec3a54ec 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 DEFINE_PER_CPU(bool, nf_skb_duplicated);
 EXPORT_SYMBOL_GPL(nf_skb_duplicated);
 
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
 #endif
@@ -347,7 +347,7 @@ static int __nf_register_net_hook(struct net *net, int pf,
 	if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_inc_ingress_queue();
 #endif
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 	static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
 	BUG_ON(p == new_hooks);
@@ -405,7 +405,7 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
 		if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 			net_dec_ingress_queue();
 #endif
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
 		static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
 	} else {
diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh
index 083c526073ef..8b980fb2270a 100755
--- a/scripts/gcc-goto.sh
+++ b/scripts/gcc-goto.sh
@@ -3,7 +3,7 @@
 # Test for gcc 'asm goto' support
 # Copyright (C) 2010, Jason Baron <jbaron@redhat.com>
 
-cat << "END" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
+cat << "END" | $@ -x c - -fno-PIE -c -o /dev/null
 int main(void)
 {
 #if defined(__arm__) || defined(__aarch64__)
diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h
index dc90c0c2fae3..fee7983a90b4 100644
--- a/tools/arch/x86/include/asm/rmwcc.h
+++ b/tools/arch/x86/include/asm/rmwcc.h
@@ -2,7 +2,7 @@
 #ifndef _TOOLS_LINUX_ASM_X86_RMWcc
 #define _TOOLS_LINUX_ASM_X86_RMWcc
 
-#ifdef CC_HAVE_ASM_GOTO
+#ifdef CONFIG_CC_HAS_ASM_GOTO
 
 #define __GEN_RMWcc(fullop, var, cc, ...)				\
 do {									\
@@ -20,7 +20,7 @@ cc_label:								\
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
 
-#else /* !CC_HAVE_ASM_GOTO */
+#else /* !CONFIG_CC_HAS_ASM_GOTO */
 
 #define __GEN_RMWcc(fullop, var, cc, ...)				\
 do {									\
@@ -37,6 +37,6 @@ do {									\
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
 
-#endif /* CC_HAVE_ASM_GOTO */
+#endif /* CONFIG_CC_HAS_ASM_GOTO */
 
 #endif /* _TOOLS_LINUX_ASM_X86_RMWcc */
-- 
cgit 


From ad774086356da92a477a87916613d96f4b36005c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 31 Dec 2018 17:24:09 +0900
Subject: kbuild: change filechk to surround the given command with { }

filechk_* rules often consist of multiple 'echo' lines. They must be
surrounded with { } or ( ) to work correctly. Otherwise, only the
string from the last 'echo' would be written into the target.

Let's take care of that in the 'filechk' in scripts/Kbuild.include
to clean up filechk_* rules.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Kbuild                   | 2 +-
 Makefile                 | 6 +++---
 arch/s390/tools/Makefile | 2 +-
 firmware/Makefile        | 5 ++---
 kernel/Makefile          | 6 +++++-
 scripts/Kbuild.include   | 2 +-
 scripts/Makefile.lib     | 3 +--
 7 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/Kbuild b/Kbuild
index 414ae6da1f50..06b801e12fb4 100644
--- a/Kbuild
+++ b/Kbuild
@@ -27,7 +27,7 @@ timeconst-file := include/generated/timeconst.h
 targets += $(timeconst-file)
 
 define filechk_gentimeconst
-	(echo $(CONFIG_HZ) | bc -q $< )
+	echo $(CONFIG_HZ) | bc -q $<
 endef
 
 $(timeconst-file): kernel/time/timeconst.bc FORCE
diff --git a/Makefile b/Makefile
index 04a857817f77..437d6033598c 100644
--- a/Makefile
+++ b/Makefile
@@ -1127,13 +1127,13 @@ define filechk_utsrelease.h
 	  echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2;    \
 	  exit 1;                                                         \
 	fi;                                                               \
-	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
+	echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"
 endef
 
 define filechk_version.h
-	(echo \#define LINUX_VERSION_CODE $(shell                         \
+	echo \#define LINUX_VERSION_CODE $(shell                         \
 	expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 0$(SUBLEVEL)); \
-	echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';)
+	echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))'
 endef
 
 $(version_h): FORCE
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index 48cdac1143a9..cf4846a7ee8d 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -25,7 +25,7 @@ define filechk_facility-defs.h
 endef
 
 define filechk_dis-defs.h
-	( $(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt )
+	$(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt
 endef
 
 $(kapi)/facility-defs.h: $(obj)/gen_facilities FORCE
diff --git a/firmware/Makefile b/firmware/Makefile
index e2f7dd2f30e0..37e5ae387400 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -13,7 +13,7 @@ ASM_WORD  = $(if $(CONFIG_64BIT),.quad,.long)
 ASM_ALIGN = $(if $(CONFIG_64BIT),3,2)
 PROGBITS  = $(if $(CONFIG_ARM),%,@)progbits
 
-filechk_fwbin = { \
+filechk_fwbin = \
 	echo "/* Generated by $(src)/Makefile */"		;\
 	echo "    .section .rodata"				;\
 	echo "    .p2align $(ASM_ALIGN)"			;\
@@ -28,8 +28,7 @@ filechk_fwbin = { \
 	echo "    .p2align $(ASM_ALIGN)"			;\
 	echo "    $(ASM_WORD) _fw_$(FWSTR)_name"		;\
 	echo "    $(ASM_WORD) _fw_$(FWSTR)_bin"			;\
-	echo "    $(ASM_WORD) _fw_end - _fw_$(FWSTR)_bin"	;\
-}
+	echo "    $(ASM_WORD) _fw_end - _fw_$(FWSTR)_bin"
 
 $(obj)/%.gen.S: FORCE
 	$(call filechk,fwbin)
diff --git a/kernel/Makefile b/kernel/Makefile
index cde93d54c571..6aa7543bcdb2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -122,7 +122,11 @@ targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
-      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+filechk_ikconfiggz = \
+	echo "static const char kernel_config_data[] __used = MAGIC_START"; \
+	cat $< | scripts/bin2c; \
+	echo "MAGIC_END;"
+
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call filechk,ikconfiggz)
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 46bf1a073f5d..74a3fe7ddc01 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -56,7 +56,7 @@ kecho := $($(quiet)kecho)
 define filechk
 	$(Q)set -e;				\
 	mkdir -p $(dir $@);			\
-	$(filechk_$(1)) > $@.tmp;		\
+	{ $(filechk_$(1)); } > $@.tmp;		\
 	if [ -r $@ ] && cmp -s $@ $@.tmp; then	\
 		rm -f $@.tmp;			\
 	else					\
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 390957f9306f..12b88d09c3a4 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -417,7 +417,6 @@ endef
 # Use filechk to avoid rebuilds when a header changes, but the resulting file
 # does not
 define filechk_offsets
-	( \
 	 echo "#ifndef $2"; \
 	 echo "#define $2"; \
 	 echo "/*"; \
@@ -428,5 +427,5 @@ define filechk_offsets
 	 echo ""; \
 	 sed -ne $(sed-offsets) < $<; \
 	 echo ""; \
-	 echo "#endif" )
+	 echo "#endif"
 endef
-- 
cgit